Release: v4.12.2

Fixing image segmentation with inference mode. (#14204 )
* Fixing image segmentation for inference mode. * Update src/transformers/pipelines/base.py Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com> Co-authored-by: Patrick von Platen <patrick.v.platen@gmail.com>
2021-10-29 14:48:05 -04:00 · 2021-10-29 14:47:39 -04:00 · 2021-10-29 13:45:16 -04:00 · 2021-10-29 13:44:46 -04:00 · 2021-10-28 12:09:49 -04:00 · 2021-10-28 18:01:00 +02:00
793 changed files with 100494 additions and 13877 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -80,8 +80,8 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,speech,vision]
-            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
+            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
            - save_cache:
                key: v0.4-{{ checksum "setup.py" }}
                paths:
@@ -97,6 +97,37 @@ jobs:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_torch_and_tf_all:  # scheduled from the `nightly` workflow; runs the PT/TF cross tests over the whole `tests` directory
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_PT_TF_CROSS_TESTS: yes  # presumably enables the tests selected by `-m is_pt_tf_cross_test` below - confirm
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch_and_tf-{{ checksum "setup.py" }}  # job-specific cache, tried first
+                      - v0.4-{{ checksum "setup.py" }}  # generic fallback
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html  # wheel index for torch 1.10.0 (CPU)
+            - save_cache:
+                  key: v0.4-torch_and_tf-{{ checksum "setup.py" }}  # save under the job-specific key that restore_cache looks up first
+                  paths:
+                      - '~/.cache/pip'
+            - run: |  # output is mirrored to tests_output.txt so the artifact step below can upload it
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf tests -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_torch_and_flax:
        working_directory: ~/transformers
@@ -116,8 +147,8 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,speech,vision]
-            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
+            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
            - save_cache:
                key: v0.4-{{ checksum "setup.py" }}
                paths:
@@ -133,6 +164,37 @@ jobs:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_torch_and_flax_all:  # scheduled from the `nightly` workflow; runs the PT/Flax cross tests over the whole `tests` directory
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_PT_FLAX_CROSS_TESTS: yes  # presumably enables the tests selected by `-m is_pt_flax_cross_test` below - confirm
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch_and_flax-{{ checksum "setup.py" }}  # job-specific cache, tried first
+                      - v0.4-{{ checksum "setup.py" }}  # generic fallback
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,flax,torch,testing,sentencepiece,torch-speech,vision]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html  # wheel index for torch 1.10.0 (CPU)
+            - save_cache:
+                  key: v0.4-torch_and_flax-{{ checksum "setup.py" }}  # save under the job-specific key that restore_cache looks up first
+                  paths:
+                      - '~/.cache/pip'
+            - run: |  # output is mirrored to tests_output.txt so the artifact step below can upload it
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax tests -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_torch:
        working_directory: ~/transformers
@@ -151,8 +213,8 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision,timm]
-            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
+            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
@@ -168,6 +230,36 @@ jobs:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_torch_all:  # scheduled from the `nightly` workflow; runs pytest over the whole `tests` directory with the torch extras
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}  # job-specific cache, tried first
+                      - v0.4-{{ checksum "setup.py" }}  # generic fallback
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html  # wheel index for torch 1.10.0 (CPU)
+            - save_cache:
+                  key: v0.4-torch-{{ checksum "setup.py" }}  # same key as the first restore_cache entry
+                  paths:
+                      - '~/.cache/pip'
+            - run: |  # 3 workers here (other *_all jobs use 8); output is mirrored to tests_output.txt for the artifact step
+                  python -m pytest -n 3 --dist=loadfile -s --make-reports=tests_torch tests | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_tf:
        working_directory: ~/transformers
@@ -185,7 +277,7 @@ jobs:
                      - v0.4-tf-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech]
            - save_cache:
                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
@@ -201,6 +293,34 @@ jobs:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_tf_all:  # scheduled from the `nightly` workflow; runs pytest over the whole `tests` directory with the TensorFlow extras
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-tf-{{ checksum "setup.py" }}  # job-specific cache, tried first
+                      - v0.4-{{ checksum "setup.py" }}  # generic fallback
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech]
+            - save_cache:
+                  key: v0.4-tf-{{ checksum "setup.py" }}  # same key as the first restore_cache entry
+                  paths:
+                      - '~/.cache/pip'
+            - run: |  # output is mirrored to tests_output.txt so the artifact step below can upload it
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_tf tests | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_flax:
        working_directory: ~/transformers
@@ -218,7 +338,7 @@ jobs:
                    - v0.4-flax-{{ checksum "setup.py" }}
                    - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: sudo pip install .[flax,testing,sentencepiece]
+            - run: sudo pip install .[flax,testing,sentencepiece,flax-speech,vision]
            - save_cache:
                  key: v0.4-flax-{{ checksum "setup.py" }}
                  paths:
@@ -234,6 +354,34 @@ jobs:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_flax_all:  # scheduled from the `nightly` workflow; runs pytest over the whole `tests` directory with the Flax extras
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                keys:
+                    - v0.4-flax-{{ checksum "setup.py" }}  # job-specific cache, tried first
+                    - v0.4-{{ checksum "setup.py" }}  # generic fallback
+            - run: pip install --upgrade pip
+            - run: sudo pip install .[flax,testing,sentencepiece,vision,flax-speech]  # NOTE(review): installed with sudo, unlike the pip upgrade above - confirm this is intended
+            - save_cache:
+                  key: v0.4-flax-{{ checksum "setup.py" }}  # same key as the first restore_cache entry
+                  paths:
+                      - '~/.cache/pip'
+            - run: |  # output is mirrored to tests_output.txt so the artifact step below can upload it
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_flax tests | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_pipelines_torch:
        working_directory: ~/transformers
@@ -253,8 +401,8 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing,sentencepiece,speech,vision]
-            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cpu.html
+            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
@@ -270,6 +418,37 @@ jobs:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_pipelines_torch_all:  # scheduled from the `nightly` workflow; runs the pipeline-marked tests under `tests` with the torch extras
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_PIPELINE_TESTS: yes  # presumably enables the tests selected by `-m is_pipeline_test` below - confirm
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}  # job-specific cache, tried first
+                      - v0.4-{{ checksum "setup.py" }}  # generic fallback
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,torch,testing,sentencepiece,torch-speech,vision,timm]
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html  # wheel index for torch 1.10.0 (CPU)
+            - save_cache:
+                  key: v0.4-torch-{{ checksum "setup.py" }}  # same key as the first restore_cache entry
+                  paths:
+                      - '~/.cache/pip'
+            - run: |  # output is mirrored to tests_output.txt so the artifact step below can upload it
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test tests | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_pipelines_tf:
        working_directory: ~/transformers
@@ -305,6 +484,35 @@ jobs:
            - store_artifacts:
                  path: ~/transformers/reports

+    run_tests_pipelines_tf_all:  # scheduled from the `nightly` workflow; runs the pipeline-marked tests under `tests` with the TensorFlow extras
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            RUN_PIPELINE_TESTS: yes  # presumably enables the tests selected by `-m is_pipeline_test` below - confirm
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-tf-{{ checksum "setup.py" }}  # job-specific cache, tried first
+                      - v0.4-{{ checksum "setup.py" }}  # generic fallback
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
+            - save_cache:
+                  key: v0.4-tf-{{ checksum "setup.py" }}  # same key as the first restore_cache entry
+                  paths:
+                      - '~/.cache/pip'
+            - run: |  # output is mirrored to tests_output.txt so the artifact step below can upload it
+                  python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf tests -m is_pipeline_test | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
    run_tests_custom_tokenizers:
        working_directory: ~/transformers
        docker:
@@ -349,24 +557,55 @@ jobs:
                  keys:
                      - v0.4-torch_examples-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,sentencepiece,testing]
+            - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
            - run: pip install -r examples/pytorch/_tests_requirements.txt
            - save_cache:
                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - run: python utils/tests_fetcher.py --filters examples tests | tee test_preparation.txt
            - store_artifacts:
                  path: ~/transformers/test_preparation.txt
            - run: |
                  if [ -f test_list.txt ]; then
-                    TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
+                    python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee tests_output.txt
                  fi
            - store_artifacts:
                  path: ~/transformers/examples_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_examples_torch_all:  # scheduled from the `nightly` workflow; runs pytest over ./examples/pytorch/ unconditionally
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.6
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch_examples-{{ checksum "setup.py" }}  # job-specific cache, tried first
+                      - v0.4-{{ checksum "setup.py" }}  # generic fallback
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,torch,sentencepiece,testing,torch-speech]
+            - run: pip install -r examples/pytorch/_tests_requirements.txt  # extra requirements for the example tests
+            - save_cache:
+                  key: v0.4-torch_examples-{{ checksum "setup.py" }}  # same key as the first restore_cache entry
+                  paths:
+                      - '~/.cache/pip'
+            - run: |  # TRANSFORMERS_IS_CI is set inline here as well as in `environment` above; output goes to examples_output.txt
+                  TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
+            - store_artifacts:
+                  path: ~/transformers/examples_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_hub:
        working_directory: ~/transformers
@@ -399,8 +638,45 @@ jobs:
                  path: ~/transformers/test_preparation.txt
            - run: |
                  if [ -f test_list.txt ]; then
-                    python -m pytest -sv $(cat test_list.txt) -m is_staging_test
+                    python -m pytest -sv --make-reports=tests_hub $(cat test_list.txt) -m is_staging_test | tee tests_output.txt
                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+    
+    run_tests_hub_all:  # scheduled from the `nightly` workflow; runs the tests marked `is_staging_test` under `tests`
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            HUGGINGFACE_CO_STAGING: yes  # presumably points the tests at the staging hub - confirm
+            RUN_GIT_LFS_TESTS: yes  # presumably enables git-lfs dependent tests (git-lfs is installed below) - confirm
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-hub-{{ checksum "setup.py" }}  # job-specific cache, tried first
+                      - v0.4-{{ checksum "setup.py" }}  # generic fallback
+            - run: sudo apt-get install git-lfs
+            - run: |  # set a global git identity (dummy CI user)
+                git config --global user.email "ci@dummy.com"
+                git config --global user.name "ci"
+            - run: pip install --upgrade pip
+            - run: pip install .[torch,sentencepiece,testing]
+            - save_cache:
+                  key: v0.4-hub-{{ checksum "setup.py" }}  # same key as the first restore_cache entry
+                  paths:
+                      - '~/.cache/pip'
+            - run: |  # output is mirrored to tests_output.txt so the artifact step below can upload it
+                  python -m pytest -sv --make-reports=tests_hub tests -m is_staging_test | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports

    run_tests_onnxruntime:
        working_directory: ~/transformers
@@ -428,12 +704,41 @@ jobs:
                  path: ~/transformers/test_preparation.txt
            - run: |
                  if [ -f test_list.txt ]; then
-                    python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch $(cat test_list.txt) -k onnx | tee tests_output.txt
+                    python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_onnx $(cat test_list.txt) -k onnx | tee tests_output.txt
                  fi
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
                  path: ~/transformers/reports
+    
+    run_tests_onnxruntime_all:  # scheduled from the `nightly` workflow; runs the tests matching `-k onnx` under `tests`
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-onnx-{{ checksum "setup.py" }}  # must match the save_cache key below, otherwise the saved cache is never read back
+                      - v0.4-{{ checksum "setup.py" }}  # generic fallback
+            - run: pip install --upgrade pip
+            - run: pip install .[torch,testing,sentencepiece,onnxruntime]
+            - save_cache:
+                  key: v0.4-onnx-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: |  # single worker; output is mirrored to tests_output.txt so the artifact step below can upload it
+                  python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_onnx tests -k onnx | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
    build_doc:
        working_directory: ~/transformers
        docker:
@@ -448,6 +753,7 @@ jobs:
            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
            - run: pip install --upgrade pip
            - run: pip install ."[docs]"
+            - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
            - save_cache:
                  key: v0.4-build_doc-{{ checksum "setup.py" }}
                  paths:
@@ -524,6 +830,44 @@ jobs:
            - run: pip install requests
            - run: python ./utils/link_tester.py

+    run_tests_layoutlmv2:  # runs the test files matching tests/*layoutlmv2* with detectron2 and tesseract installed
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}  # shares the torch cache key with the torch jobs
+                      - v0.4-{{ checksum "setup.py" }}  # generic fallback
+            - run: sudo apt-get -y update && sudo apt-get install -y libsndfile1-dev
+            - run: pip install --upgrade pip
+            - run: pip install .[torch,testing,vision]
+            - run: pip install torchvision
+            - run: python -m pip install 'git+https://github.com/facebookresearch/detectron2.git'  # installed from the git repository, unpinned
+            - run: sudo apt install tesseract-ocr
+            - run: pip install pytesseract
+            - save_cache:
+                  key: v0.4-torch-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt  # presumably writes test_list.txt when relevant tests are found - confirm
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |  # tests run only if test_list.txt exists; output is teed so the tests_output.txt artifact below is actually produced
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 1 tests/*layoutlmv2* --dist=loadfile -s --make-reports=tests_layoutlmv2 --durations=100 | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
 # TPU JOBS
    run_examples_tpu:
        docker:
@@ -578,7 +922,28 @@ workflows:
            - run_tests_onnxruntime
            - run_tests_hub
            - build_doc
+            - run_tests_layoutlmv2
            - deploy_doc: *workflow_filters
+    nightly:  # scheduled workflow that runs the `*_all` job variants
+        triggers:
+            - schedule:
+                cron: "0 0 * * *"  # every day at 00:00
+                filters:
+                    branches:
+                        only:
+                            - master  # schedule applies to the master branch only
+        jobs:
+            - run_examples_torch_all
+            - run_tests_torch_and_tf_all
+            - run_tests_torch_and_flax_all
+            - run_tests_torch_all
+            - run_tests_tf_all
+            - run_tests_flax_all
+            - run_tests_pipelines_torch_all
+            - run_tests_pipelines_tf_all
+            - run_tests_onnxruntime_all
+            - run_tests_hub_all
+
 #    tpu_testing_jobs:
 #        triggers:
 #            - schedule:
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -67,4 +67,13 @@ deploy_doc "25dee4a" v4.6.0
 deploy_doc "7a6c9fa" v4.7.0
 deploy_doc "9252a51" v4.8.0
 deploy_doc "1366172" v4.8.1
-deploy_doc "96d1cfb"  # v4.8.2 Latest stable release
+deploy_doc "96d1cfb" v4.8.2
+deploy_doc "72aee83" v4.9.0
+deploy_doc "bff1c71" v4.9.1
+deploy_doc "41981a2" v4.9.2
+deploy_doc "39cb6f5" v4.10.0
+deploy_doc "28e2787" v4.10.1
+deploy_doc "dc193c9" v4.11.0
+deploy_doc "54f9d62" v4.11.1
+deploy_doc "7655f11" v4.11.2
+deploy_doc "65659a2"  # v4.11.3 Latest stable release
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -27,30 +27,38 @@ assignees: ''

 Models:

- albert, bert, xlm: @LysandreJik
- blenderbot, bart, marian, pegasus, encoderdecoder,  t5: @patrickvonplaten, @patil-suraj
- longformer, reformer, transfoxl, xlnet: @patrickvonplaten
- fsmt: @stas00
- funnel: @sgugger
- gpt2: @patrickvonplaten, @LysandreJik
- rag: @patrickvonplaten, @lhoestq
- tensorflow: @Rocketknight1
+- ALBERT, BERT, XLM, DeBERTa, DeBERTa-v2, ELECTRA, MobileBert, SqueezeBert: @LysandreJik
+- encoder-decoder models (For example, BlenderBot, BART, Marian, Pegasus, T5, ByT5): @patrickvonplaten, @patil-suraj
+- Longformer, Reformer, TransfoXL, XLNet, FNet: @patrickvonplaten
+- FSMT: @stas00
+- Funnel: @sgugger
+- GPT-2, GPT: @patrickvonplaten, @LysandreJik
+- RAG, DPR: @patrickvonplaten, @lhoestq
+- TensorFlow: @Rocketknight1
+- JAX/Flax: @patil-suraj @patrickvonplaten 
+- TAPAS, LayoutLM, LayoutLMv2, LUKE, ViT, BEiT, DEiT, DETR, CANINE: @NielsRogge
+- GPT-Neo, GPT-J, CLIP: @patil-suraj
+- Wav2Vec2, HuBERT, SpeechEncoderDecoder: @patrickvonplaten, @anton-l
+
+If the model isn't in the list, ping @LysandreJik who will redirect you to the correct contributor.

 Library:

- benchmarks: @patrickvonplaten
- deepspeed: @stas00
- ray/raytune: @richardliaw, @amogkam
- text generation: @patrickvonplaten
- tokenizers: @LysandreJik
- trainer: @sgugger
- pipelines: @LysandreJik
+- Benchmarks: @patrickvonplaten
+- Deepspeed: @stas00
+- Ray/raytune: @richardliaw, @amogkam
+- Text generation: @patrickvonplaten
+- Tokenizers: @LysandreJik
+- Trainer: @sgugger
+- Pipelines: @Narsil
+- Speech: @patrickvonplaten, @anton-l
+- Vision: @NielsRogge, @sgugger

 Documentation: @sgugger

 Model hub:

- for issues with a model report at https://discuss.huggingface.co/ and tag the model's creator.
+- for issues with a model, report at https://discuss.huggingface.co/ and tag the model's creator.

 HF projects:

@@ -60,6 +68,9 @@ HF projects:
 Examples:

 - maintained examples (not research project or legacy): @sgugger, @patil-suraj
+
+For research projects, please ping the contributor directly. For example, on the following projects:
+
 - research_projects/bert-loses-patience: @JetRunner
 - research_projects/distillation: @VictorSanh

--- a/.github/workflows/doctests.yml
+++ b/.github/workflows/doctests.yml
@@ -0,0 +1,42 @@
+name: Doctests  # workflow that runs the documentation tests listed in utils/documentation_tests.txt
+
+on:
+  push:
+    branches:
+      - doctest*  # pushes to branches whose name starts with "doctest"
+  repository_dispatch:
+  schedule:
+    - cron: "0 0 * * *"  # every day at 00:00
+
+
+env:
+  HF_HOME: /mnt/cache  # same path the container mounts the host cache onto (see `options` below)
+  TRANSFORMERS_IS_CI: yes
+  RUN_SLOW: yes  # presumably enables tests marked as slow - confirm
+  OMP_NUM_THREADS: 16
+  MKL_NUM_THREADS: 16
+  PYTEST_TIMEOUT: 600
+
+jobs:
+  run_doctests:
+    runs-on: [self-hosted, docker-gpu, single-gpu]
+    container:
+      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/  # NOTE(review): confirm `--gpus 0` exposes the intended GPU to the container
+    steps:
+      - name: Launcher docker  # this step checks out the repository
+        uses: actions/checkout@v2
+
+      - name: NVIDIA-SMI
+        run: |
+          nvidia-smi
+
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y libsndfile1-dev
+          pip install --upgrade pip
+          pip install .[dev]
+
+      - name: Run doctests  # targets are read from utils/documentation_tests.txt; keeps going after a failing example
+        run: |
+          pytest --doctest-modules $(cat utils/documentation_tests.txt) -sv --doctest-continue-on-failure
--- a/.github/workflows/model-templates.yml
+++ b/.github/workflows/model-templates.yml
@@ -36,7 +36,7 @@ jobs:

      - name: Install dependencies
        run: |
-          pip install --upgrade pip
+          pip install --upgrade pip!=21.3
          sudo apt -y update && sudo apt install -y libsndfile1-dev
          pip install .[dev]
      - name: Create model files
@@ -47,6 +47,8 @@ jobs:
          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/flax-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/flax-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
          make style
          python utils/check_table.py --fix_and_overwrite
          python utils/check_dummies.py --fix_and_overwrite
@@ -59,7 +61,7 @@ jobs:
      - name: Run style changes
        run: |
          git fetch origin master:master
-          make fixup
+          make style && make quality

      - name: Failure short reports
        if: ${{ always() }}
--- a/.github/workflows/self-nightly-scheduled.yml
+++ b/.github/workflows/self-nightly-scheduled.yml
@@ -0,0 +1,257 @@
+# Nightly CI: runs the slow test suites against the PyTorch nightly build on self-hosted runners.
+name: Self-hosted runner; Nightly (scheduled)
+
+on:
+    # Pushing to any branch whose name starts with "nightly_ci" triggers a run.
+    push:
+        branches:
+            - nightly_ci*
+    repository_dispatch:
+    schedule:
+        # Midnight on every 3rd day-of-month (1, 4, 7, ...); the step restarts at each month boundary.
+        - cron: "0 0 */3 * *"
+
+# Environment shared by every job and step in this workflow.
+env:
+    # Same path as the container-side target of the cache volume mounted by the jobs below.
+    HF_HOME: /mnt/cache
+    TRANSFORMERS_IS_CI: yes
+    RUN_SLOW: yes
+    OMP_NUM_THREADS: 16
+    MKL_NUM_THREADS: 16
+    PYTEST_TIMEOUT: 600
+    SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}
+
+jobs:
+    # Runs the test suite, the examples tests and the pipeline tests on a single-GPU runner.
+    run_all_tests_torch_gpu:
+        runs-on: [self-hosted, docker-gpu, single-gpu]
+        container:
+            image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+            # Mounts the host cache directory at /mnt/cache/ inside the container.
+            options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+        steps:
+            - name: Launcher docker
+              uses: actions/checkout@v2
+
+            - name: NVIDIA-SMI
+              run: |
+                  nvidia-smi
+
+            # The last pip command upgrades torch/torchvision/torchaudio to pre-release
+            # builds from the cu111 nightly index, replacing the versions from the image.
+            - name: Install dependencies
+              run: |
+                  apt -y update && apt install -y libsndfile1-dev git
+                  pip install --upgrade pip
+                  pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+
+            # Prints CUDA/cuDNN availability and the GPU count seen by torch.
+            - name: Are GPUs recognized by our DL frameworks
+              run: |
+                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
+                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+            - name: Run all tests on GPU
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests
+
+            # always(): the short failure report is printed even when the test step failed.
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_gpu_failures_short.txt
+
+            # NOTE(review): these step-level variables repeat the values already set in the
+            # workflow-level env block; they look redundant - confirm before removing.
+            - name: Run examples tests on GPU
+              if: ${{ always() }}
+              env:
+                  OMP_NUM_THREADS: 16
+                  MKL_NUM_THREADS: 16
+                  RUN_SLOW: yes
+                  HF_HOME: /mnt/cache
+                  TRANSFORMERS_IS_CI: yes
+              run: |
+                  pip install -r examples/pytorch/_tests_requirements.txt
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples
+
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/examples_torch_gpu_failures_short.txt
+
+            # Selects only tests carrying the is_pipeline_test marker.
+            - name: Run all pipeline tests on GPU
+              if: ${{ always() }}
+              env:
+                  RUN_PIPELINE_TESTS: yes
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_pipeline_gpu_failures_short.txt
+
+            # Uploads the whole reports directory regardless of the outcome of earlier steps.
+            - name: Test suite reports artifacts
+              if: ${{ always() }}
+              uses: actions/upload-artifact@v2
+              with:
+                  name: run_all_tests_torch_gpu_test_reports
+                  path: reports
+
+    # Runs the test suite and the pipeline tests on a multi-GPU runner.
+    run_all_tests_torch_multi_gpu:
+        runs-on: [self-hosted, docker-gpu, multi-gpu]
+        container:
+            image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
+            # "--gpus all" exposes every GPU of the host to the container.
+            options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+        steps:
+            - name: Launcher docker
+              uses: actions/checkout@v2
+
+            # continue-on-error: a failing nvidia-smi call does not fail the job.
+            - name: NVIDIA-SMI
+              continue-on-error: true
+              run: |
+                  nvidia-smi
+
+            # The last pip command upgrades torch/torchvision/torchaudio to pre-release
+            # builds from the cu111 nightly index, replacing the versions from the image.
+            - name: Install dependencies
+              run: |
+                  apt -y update && apt install -y libsndfile1-dev git
+                  pip install --upgrade pip
+                  pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+
+            # Prints CUDA/cuDNN availability and the GPU count seen by torch.
+            - name: Are GPUs recognized by our DL frameworks
+              run: |
+                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
+                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+            - name: Run all tests on GPU
+              env:
+                  MKL_SERVICE_FORCE_INTEL: 1
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests
+
+            # always(): the short failure report is printed even when the test step failed.
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_multi_gpu_failures_short.txt
+
+            # Selects only tests carrying the is_pipeline_test marker.
+            - name: Run all pipeline tests on GPU
+              if: ${{ always() }}
+              env:
+                  RUN_PIPELINE_TESTS: yes
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
+
+            # Uploads the whole reports directory regardless of the outcome of earlier steps.
+            - name: Test suite reports artifacts
+              if: ${{ always() }}
+              uses: actions/upload-artifact@v2
+              with:
+                  name: run_all_tests_torch_multi_gpu_test_reports
+                  path: reports
+
+    # Runs tests/deepspeed and tests/extended on a single-GPU runner.
+    run_all_tests_torch_cuda_extensions_gpu:
+        runs-on: [self-hosted, docker-gpu, single-gpu]
+        container:
+            image: nvcr.io/nvidia/pytorch:21.03-py3
+            # Mounts the host cache directory at /mnt/cache/ inside the container.
+            options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+        steps:
+            - name: Launcher docker
+              uses: actions/checkout@v2
+
+            - name: NVIDIA-SMI
+              run: |
+                  nvidia-smi
+
+            # Nightly torch is installed before the package extras; DeepSpeed is then
+            # installed from the head of its GitHub repository.
+            - name: Install dependencies
+              run: |
+                  apt -y update && apt install -y libaio-dev
+                  pip install --upgrade pip
+                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+                  pip install .[testing,deepspeed]
+                  pip install git+https://github.com/microsoft/DeepSpeed
+
+            # Prints CUDA/cuDNN availability and the GPU count seen by torch.
+            - name: Are GPUs recognized by our DL frameworks
+              run: |
+                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
+                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+            - name: Run all tests on GPU
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+
+            # always(): the short failure report is printed even when the test step failed.
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt
+
+            # NOTE(review): the artifact name starts with "run_tests_" while the job id starts
+            # with "run_all_tests_"; presumably a downstream consumer expects this exact name -
+            # confirm before renaming.
+            - name: Test suite reports artifacts
+              if: ${{ always() }}
+              uses: actions/upload-artifact@v2
+              with:
+                  name: run_tests_torch_cuda_extensions_gpu_test_reports
+                  path: reports
+
+    # Runs tests/deepspeed and tests/extended on a multi-GPU runner.
+    run_all_tests_torch_cuda_extensions_multi_gpu:
+        runs-on: [self-hosted, docker-gpu, multi-gpu]
+        container:
+            image: nvcr.io/nvidia/pytorch:21.03-py3
+            # "--gpus all" exposes every GPU of the multi-GPU host to the container,
+            # matching the other multi-GPU job in this workflow.
+            options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+        steps:
+            - name: Launcher docker
+              uses: actions/checkout@v2
+
+            # continue-on-error: a failing nvidia-smi call does not fail the job.
+            - name: NVIDIA-SMI
+              continue-on-error: true
+              run: |
+                  nvidia-smi
+
+            # Nightly torch is installed before the package extras; DeepSpeed is then
+            # installed from the head of its GitHub repository.
+            - name: Install dependencies
+              run: |
+                  apt -y update && apt install -y libaio-dev
+                  pip install --upgrade pip
+                  pip install --pre torch torchvision torchaudio -f https://download.pytorch.org/whl/nightly/cu111/torch_nightly.html -U
+                  pip install .[testing,deepspeed,fairscale]
+                  pip install git+https://github.com/microsoft/DeepSpeed
+
+            # Prints CUDA/cuDNN availability and the GPU count seen by torch.
+            - name: Are GPUs recognized by our DL frameworks
+              run: |
+                  python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+                  python -c "import torch; print('Cuda version:', torch.version.cuda)"
+                  python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
+                  python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+            - name: Run all tests on GPU
+              run: |
+                  python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+
+            # always(): the short failure report is printed even when the test step failed.
+            - name: Failure short reports
+              if: ${{ always() }}
+              run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt
+
+            # NOTE(review): the artifact name starts with "run_tests_" while the job id starts
+            # with "run_all_tests_"; presumably a downstream consumer expects this exact name -
+            # confirm before renaming.
+            - name: Test suite reports artifacts
+              if: ${{ always() }}
+              uses: actions/upload-artifact@v2
+              with:
+                  name: run_tests_torch_cuda_extensions_multi_gpu_test_reports
+                  path: reports
+
+    # Collects the report artifacts of all test jobs and posts a summary to Slack.
+    send_results:
+        name: Send results to webhook
+        runs-on: ubuntu-latest
+        # always(): report even when one or more of the test jobs failed.
+        if: always()
+        needs:
+            - run_all_tests_torch_gpu
+            - run_all_tests_torch_multi_gpu
+            - run_all_tests_torch_cuda_extensions_gpu
+            - run_all_tests_torch_cuda_extensions_multi_gpu
+        steps:
+            - uses: actions/checkout@v2
+
+            # No "with:" block, so no artifact name filter is applied to the download.
+            - uses: actions/download-artifact@v2
+
+            - name: Send message to Slack
+              env:
+                  CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
+                  CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+                  CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}
+                  CI_SLACK_CHANNEL_ID_PAST_FUTURE: ${{ secrets.CI_SLACK_CHANNEL_ID_PAST_FUTURE }}
+              run: |
+                  pip install slack_sdk
+                  python utils/notification_service.py scheduled nightly-torch
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -11,6 +11,7 @@ on:
      - "tests/**"
      - ".github/**"
      - "templates/**"
+      - "utils/**"
  repository_dispatch:

 env:
@@ -27,32 +28,47 @@ jobs:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+          apt install -y libsndfile1-dev
+          pip install --upgrade pip
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+
      - name: Launcher docker
        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2

      - name: NVIDIA-SMI
        run: |
          nvidia-smi

-      - name: Install dependencies
-        run: |
-          apt -y update && apt install -y libsndfile1-dev
-          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
-
      - name: Are GPUs recognized by our DL frameworks
        run: |
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+      
+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt

      - name: Run all non-slow tests on GPU
        run: |
-          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_gpu tests
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
+        if: ${{ failure() }}
        run: cat reports/tests_torch_gpu_failures_short.txt

      - name: Test suite reports artifacts
@@ -62,6 +78,61 @@ jobs:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

+  # Runs the Flax tests selected for the last commit on a single-GPU runner.
+  run_tests_flax_gpu:
+    runs-on: [self-hosted, docker-gpu-test, single-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      # Mounts the host cache directory at /mnt/cache/ inside the container.
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      # Installs git from the git-core PPA before the checkout step runs.
+      # NOTE(review): "pip install .[...]" runs before the checkout below, so it presumably
+      # relies on the workspace already holding a checkout from an earlier run - confirm.
+      - name: Install dependencies
+        run: |
+          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
+          pip install --upgrade pip
+          pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
+
+      # fetch-depth 2 also fetches the parent commit, which the
+      # "--diff_with_last_commit" option used below presumably needs - confirm.
+      - name: Launcher docker
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      # continue-on-error: a failing nvidia-smi call does not fail the job.
+      - name: NVIDIA-SMI
+        continue-on-error: true
+        run: |
+          nvidia-smi
+
+      # Prints the JAX backend platform and the number of local devices.
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
+          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
+      
+      # The fetcher output is echoed to the log and saved to test_preparation.txt.
+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt
+
+      # Tests run only when test_list.txt exists (presumably written by tests_fetcher.py);
+      # otherwise this step does nothing.
+      - name: Run all non-slow tests on GPU
+        run: |
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_gpu $(cat test_list.txt)
+          fi
+
+      # failure(): printed only when an earlier step of this job failed.
+      - name: Failure short reports
+        if: ${{ failure() }}
+        run: cat reports/tests_flax_gpu_failures_short.txt
+
+      # Uploads the whole reports directory regardless of the outcome of earlier steps.
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_flax_gpu_test_reports
+          path: reports
+
 #  run_tests_tf_gpu:
 #    runs-on: [self-hosted, docker-gpu, single-gpu]
 #    timeout-minutes: 120
@@ -69,32 +140,47 @@ jobs:
 #      image: tensorflow/tensorflow:2.4.1-gpu
 #      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 #    steps:
+#      - name: Install dependencies
+#        run: |
+#          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+#          pip install --upgrade pip
+#          pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
+#
 #      - name: Launcher docker
 #        uses: actions/checkout@v2
+#        with:
+#          fetch-depth: 2
 #
 #      - name: NVIDIA-SMI
 #        run: |
 #          nvidia-smi
 #
-#      - name: Install dependencies
-#        run: |
-#          pip install --upgrade pip
-#          pip install .[sklearn,testing,onnxruntime,sentencepiece]
-#
 #      - name: Are GPUs recognized by our DL frameworks
 #        run: |
 #          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
 #          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
 #
+#      - name: Fetch the tests to run
+#        run: |
+#          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+#
+#      - name: Report fetched tests
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: test_fetched
+#          path: test_preparation.txt
+#
 #      - name: Run all non-slow tests on GPU
 #        env:
 #          TF_NUM_INTRAOP_THREADS: 8
 #          TF_NUM_INTEROP_THREADS: 1
 #        run: |
-#          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
+#          if [ -f test_list.txt ]; then
+#            python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu $(cat test_list.txt)
+#          fi
 #
 #      - name: Failure short reports
-#        if: ${{ always() }}
+#        if: ${{ failure() }}
 #        run: cat reports/tests_tf_gpu_failures_short.txt
 #
 #      - name: Test suite reports artifacts
@@ -111,18 +197,22 @@ jobs:
      image: pytorch/pytorch:1.9.0-cuda11.1-cudnn8-runtime
      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+          apt install -y libsndfile1-dev
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
+          pip install .[sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]
+      
+      - name: Launcher docker
+        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2
+
+      - name: NVIDIA-SMI
+        continue-on-error: true
+        run: |
+          nvidia-smi

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -131,14 +221,26 @@ jobs:
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt
+
      - name: Run all non-slow tests on GPU
        env:
          MKL_SERVICE_FORCE_INTEL: 1
        run: |
-          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_multi_gpu tests
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_multi_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
+        if: ${{ failure() }}
        run: cat reports/tests_torch_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
@@ -148,6 +250,61 @@ jobs:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports

+#  run_tests_flax_multi_gpu:
+#    runs-on: [self-hosted, docker-gpu, multi-gpu]
+#    container:
+#      image: tensorflow/tensorflow:2.4.1-gpu
+#      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#    steps:
+#      - name: Install dependencies
+#        run: |
+#          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+#          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
+#          pip install --upgrade pip
+#          pip install .[sklearn,testing,sentencepiece,flax,flax-speech,vision]
+#
+#      - name: Launcher docker
+#        uses: actions/checkout@v2
+#        with:
+#          fetch-depth: 2
+#
+#      - name: NVIDIA-SMI
+#        continue-on-error: true
+#        run: |
+#          nvidia-smi
+#
+#      - name: Are GPUs recognized by our DL frameworks
+#        run: |
+#          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
+#          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
+#      
+#      - name: Fetch the tests to run
+#        run: |
+#          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+#
+#      - name: Report fetched tests
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: test_fetched
+#          path: test_preparation.txt
+#
+#      - name: Run all non-slow tests on GPU
+#        run: |
+#          if [ -f test_list.txt ]; then
+#            python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_flax_multi_gpu $(cat test_list.txt)
+#          fi
+#
+#      - name: Failure short reports
+#        if: ${{ failure() }}
+#        run: cat reports/tests_flax_multi_gpu_failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: run_all_tests_flax_multi_gpu_test_reports
+#          path: reports
+
 #  run_tests_tf_multi_gpu:
 #    runs-on: [self-hosted, docker-gpu, multi-gpu]
 #    timeout-minutes: 120
@@ -155,32 +312,47 @@ jobs:
 #      image: tensorflow/tensorflow:2.4.1-gpu
 #      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
 #    steps:
+#      - name: Install dependencies
+#        run: |
+#          apt -y update && apt install -y software-properties-common && apt -y update && add-apt-repository -y ppa:git-core/ppa && apt -y update && apt install -y git
+#          pip install --upgrade pip
+#          pip install .[sklearn,testing,onnxruntime,sentencepiece,tf-speech]
+#
 #      - name: Launcher docker
 #        uses: actions/checkout@v2
+#        with:
+#          fetch-depth: 2
 #
 #      - name: NVIDIA-SMI
 #        run: |
 #          nvidia-smi
 #
-#      - name: Install dependencies
-#        run: |
-#          pip install --upgrade pip
-#          pip install .[sklearn,testing,onnxruntime,sentencepiece]
-#
 #      - name: Are GPUs recognized by our DL frameworks
 #        run: |
 #          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
 #          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
 #
+#      - name: Fetch the tests to run
+#        run: |
+#          python utils/tests_fetcher.py --diff_with_last_commit | tee test_preparation.txt
+#
+#      - name: Report fetched tests
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: test_fetched
+#          path: test_preparation.txt
+#
 #      - name: Run all non-slow tests on GPU
 #        env:
 #          TF_NUM_INTRAOP_THREADS: 8
 #          TF_NUM_INTEROP_THREADS: 1
 #        run: |
-#          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
+#          if [ -f test_list.txt ]; then
+#            python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu $(cat test_list.txt)
+#          fi
 #
 #      - name: Failure short reports
-#        if: ${{ always() }}
+#        if: ${{ failure() }}
 #        run: cat reports/tests_tf_multi_gpu_failures_short.txt
 #
 #      - name: Test suite reports artifacts
@@ -198,6 +370,8 @@ jobs:
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2

      - name: NVIDIA-SMI
        run: |
@@ -215,13 +389,25 @@ jobs:
          python -c "import torch; print('Cuda version:', torch.version.cuda)"
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+      
+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt
+      
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt

      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
+        if: ${{ failure() }}
        run: cat reports/tests_torch_cuda_extensions_gpu_failures_short.txt

      - name: Test suite reports artifacts
@@ -239,8 +425,11 @@ jobs:
    steps:
      - name: Launcher docker
        uses: actions/checkout@v2
+        with:
+          fetch-depth: 2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

@@ -257,12 +446,24 @@ jobs:
          python -c "import torch; print('CuDNN version:', torch.backends.cudnn.version())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

+      - name: Fetch the tests to run
+        run: |
+          python utils/tests_fetcher.py --diff_with_last_commit --filters tests/deepspeed tests/extended | tee test_preparation.txt
+
+      - name: Report fetched tests
+        uses: actions/upload-artifact@v2
+        with:
+          name: test_fetched
+          path: test_preparation.txt
+
      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+          if [ -f test_list.txt ]; then
+            python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_multi_gpu $(cat test_list.txt)
+          fi

      - name: Failure short reports
-        if: ${{ always() }}
+        if: ${{ failure() }}
        run: cat reports/tests_torch_cuda_extensions_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -15,6 +15,7 @@ env:
  OMP_NUM_THREADS: 16
  MKL_NUM_THREADS: 16
  PYTEST_TIMEOUT: 600
+  SIGOPT_API_TOKEN: ${{ secrets.SIGOPT_API_TOKEN }}

 jobs:
  run_all_tests_torch_gpu:
@@ -32,9 +33,9 @@ jobs:

      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
-          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
+          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -85,6 +86,46 @@ jobs:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

+  # Runs the whole tests directory with the Flax extras installed on a single-GPU runner.
+  run_all_tests_flax_gpu:
+    runs-on: [self-hosted, docker-gpu-test, single-gpu]
+    container:
+      image: tensorflow/tensorflow:2.4.1-gpu
+      # Mounts the host cache directory at /mnt/cache/ inside the container.
+      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      # continue-on-error: a failing nvidia-smi call does not fail the job.
+      - name: NVIDIA-SMI
+        continue-on-error: true
+        run: |
+          nvidia-smi
+
+      # Installs the CUDA 11.1 build of jax from the jax-releases index, then the package extras.
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
+          pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
+
+      # Prints the JAX backend platform and the number of local devices.
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
+          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
+
+      - name: Run all tests on GPU
+        run: |
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
+
+      # always(): the short failure report is printed even when the test step failed.
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_flax_gpu_failures_short.txt
+
+      # Uploads the whole reports directory regardless of the outcome of earlier steps.
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_flax_gpu_test_reports
+          path: reports
+
  run_all_tests_tf_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
@@ -100,8 +141,9 @@ jobs:

      - name: Install dependencies
        run: |
+          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnx,sentencepiece]
+          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -139,6 +181,45 @@ jobs:
          name: run_all_tests_tf_gpu_test_reports
          path: reports

+  # Runs the PyTorch/XLA example tests on a self-hosted TPU runner.
+  run_all_examples_torch_xla_tpu:
+    runs-on: [self-hosted, docker-tpu-test, tpu-v3-8]
+    container:
+      image: gcr.io/tpu-pytorch/xla:nightly_3.8_tpuvm
+      # Mounts the host libtpu.so and the shared cache directory into the container.
+      options: --privileged -v "/lib/libtpu.so:/lib/libtpu.so" -v /mnt/cache/.cache/huggingface:/mnt/cache/ --shm-size 16G
+    steps:
+      - name: Launcher docker
+        uses: actions/checkout@v2
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install .[testing]
+
+      # Prints the XLA device returned by torch_xla.
+      - name: Are TPUs recognized by our DL frameworks
+        env:
+          XRT_TPU_CONFIG: "localservice;0;localhost:51011"
+        run: |
+          python -c "import torch_xla.core.xla_model as xm; print(xm.xla_device())"
+
+      - name: Run example tests on TPU
+        env:
+          XRT_TPU_CONFIG: "localservice;0;localhost:51011"
+          MKL_SERVICE_FORCE_INTEL: "1"  # See: https://github.com/pytorch/pytorch/issues/37377
+        run: |
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_xla_tpu examples/pytorch/test_xla_examples.py
+
+      # always(): the short failure report is printed even when the test step failed.
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_xla_tpu_failures_short.txt
+
+      # Uploads the whole reports directory regardless of the outcome of earlier steps.
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_examples_torch_xla_tpu
+          path: reports
+
  run_all_tests_torch_multi_gpu:
    runs-on: [self-hosted, docker-gpu, multi-gpu]
    container:
@@ -149,14 +230,15 @@ jobs:
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
-          apt -y update && apt install -y libsndfile1-dev
+          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
-          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,speech,vision,timm]
+          pip install .[integrations,sklearn,testing,onnxruntime,sentencepiece,torch-speech,vision,timm]

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -203,13 +285,15 @@ jobs:
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

      - name: Install dependencies
        run: |
+          apt -y update && apt install -y libsndfile1-dev git
          pip install --upgrade pip
-          pip install .[sklearn,testing,onnx,sentencepiece]
+          pip install .[sklearn,testing,onnx,sentencepiece,tf-speech]

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -247,6 +331,45 @@ jobs:
          name: run_all_tests_tf_multi_gpu_test_reports
          path: reports

+#  run_all_tests_flax_multi_gpu:
+#    runs-on: [self-hosted, docker-gpu, multi-gpu]
+#    container:
+#      image: tensorflow/tensorflow:2.4.1-gpu
+#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#    steps:
+#      - name: Launcher docker
+#        uses: actions/checkout@v2
+#
+#      - name: NVIDIA-SMI
+#        run: |
+#          nvidia-smi
+#
+#      - name: Install dependencies
+#        run: |
+#          pip install --upgrade pip
+#          pip install --upgrade "jax[cuda111]" -f https://storage.googleapis.com/jax-releases/jax_releases.html
+#          pip install .[flax,integrations,sklearn,testing,sentencepiece,flax-speech,vision]
+#
+#      - name: Are GPUs recognized by our DL frameworks
+#        run: |
+#          python -c "from jax.lib import xla_bridge; print('GPU available:', xla_bridge.get_backend().platform)"
+#          python -c "import jax; print('Number of GPUs available:', len(jax.local_devices()))"
+#
+#      - name: Run all tests on GPU
+#        run: |
+#          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_flax_gpu tests
+#
+#      - name: Failure short reports
+#        if: ${{ always() }}
+#        run: cat reports/tests_flax_gpu_failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: run_all_tests_flax_gpu_test_reports
+#          path: reports
+
  run_all_tests_torch_cuda_extensions_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
    container:
@@ -298,6 +421,7 @@ jobs:
        uses: actions/checkout@v2

      - name: NVIDIA-SMI
+        continue-on-error: true
        run: |
          nvidia-smi

@@ -350,6 +474,7 @@ jobs:
        env:
          CI_SLACK_BOT_TOKEN: ${{ secrets.CI_SLACK_BOT_TOKEN }}
          CI_SLACK_CHANNEL_ID: ${{ secrets.CI_SLACK_CHANNEL_ID }}
+          CI_SLACK_CHANNEL_ID_DAILY: ${{ secrets.CI_SLACK_CHANNEL_ID_DAILY }}


        run: |
--- a/CITATION.cff
+++ b/CITATION.cff
@@ -0,0 +1,82 @@
+cff-version: "1.2.0"
+date-released: 2020-10
+message: "If you use this software, please cite it using these metadata."
+title: "Transformers: State-of-the-Art Natural Language Processing"
+url: "https://github.com/huggingface/transformers"
+authors: 
+  - family-names: Wolf
+    given-names: Thomas
+  - family-names: Debut
+    given-names: Lysandre
+  - family-names: Sanh
+    given-names: Victor
+  - family-names: Chaumond
+    given-names: Julien
+  - family-names: Delangue
+    given-names: Clement
+  - family-names: Moi
+    given-names: Anthony
+  - family-names: Cistac
+    given-names: Pierric
+  - family-names: Ma
+    given-names: Clara
+  - family-names: Jernite
+    given-names: Yacine
+  - family-names: Plu
+    given-names: Julien
+  - family-names: Xu
+    given-names: Canwen
+  - family-names: "Le Scao"
+    given-names: Teven
+  - family-names: Gugger
+    given-names: Sylvain
+  - family-names: Drame
+    given-names: Mariama
+  - family-names: Lhoest
+    given-names: Quentin
+  - family-names: Rush
+    given-names: "Alexander M."
+preferred-citation:
+  type: conference-paper
+  authors:
+  - family-names: Wolf
+    given-names: Thomas
+  - family-names: Debut
+    given-names: Lysandre
+  - family-names: Sanh
+    given-names: Victor
+  - family-names: Chaumond
+    given-names: Julien
+  - family-names: Delangue
+    given-names: Clement
+  - family-names: Moi
+    given-names: Anthony
+  - family-names: Cistac
+    given-names: Pierric
+  - family-names: Ma
+    given-names: Clara
+  - family-names: Jernite
+    given-names: Yacine
+  - family-names: Plu
+    given-names: Julien
+  - family-names: Xu
+    given-names: Canwen
+  - family-names: "Le Scao"
+    given-names: Teven
+  - family-names: Gugger
+    given-names: Sylvain
+  - family-names: Drame
+    given-names: Mariama
+  - family-names: Lhoest
+    given-names: Quentin
+  - family-names: Rush
+    given-names: "Alexander M."
+  booktitle: "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations"
+  month: 10
+  start: 38
+  end: 45
+  title: "Transformers: State-of-the-Art Natural Language Processing"
+  year: 2020
+  publisher: "Association for Computational Linguistics"
+  url: "https://www.aclweb.org/anthology/2020.emnlp-demos.6"
+  address: "Online"
--- a/ISSUES.md
+++ b/ISSUES.md
@@ -205,7 +205,7 @@ You are not required to read the following guidelines before opening an issue. H

   If you really tried to make a short reproducible code but couldn't figure it out, it might be that having a traceback will give the developer enough information to know what's going on. But if it is not enough and we can't reproduce the problem, we can't really solve it.

-   Do not dispair if you can't figure it out from the begining, just share what you can and perhaps someone else will be able to help you at the forums.
+   Do not despair if you can't figure it out from the beginning; just share what you can, and perhaps someone else will be able to help you on the forums.

   If your setup involves any custom datasets, the best way to help us reproduce the problem is to create a [Google Colab notebook](https://colab.research.google.com/) that demonstrates the issue and once you verify that the issue still exists, include a link to that notebook in the Issue. Just make sure that you don't copy and paste the location bar url of the open notebook - as this is private and we won't be able to open it. Instead, you need to click on `Share` in the right upper corner of the notebook, select `Get Link` and then copy and paste the public link it will give to you.

--- a/Makefile
+++ b/Makefile
@@ -30,7 +30,6 @@ deps_table_check_updated:
 # autogenerating code

 autogenerate_code: deps_table_update
-	python utils/class_mapping_update.py

 # Check that source code meets quality standards

--- a/README.md
+++ b/README.md
@@ -42,7 +42,8 @@ limitations under the License.
    <p>
        <b>English</b> |
        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hans.md">简体中文</a> |
-        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hant.md">繁體中文</a>
+        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hant.md">繁體中文</a> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_ko.md">한국어</a>
    <p>
 </h4>

@@ -211,7 +212,10 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
 1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BARTpho](https://huggingface.co/transformers/model_doc/bartpho.html)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
+1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+1. **[BERTweet](https://huggingface.co/transformers/model_doc/bertweet.html)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
 1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
@@ -234,20 +238,25 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
 for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
 Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
 1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+1. **[FNet](https://huggingface.co/transformers/model_doc/fnet.html)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
 1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
 1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
 1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
+1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
+1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
 1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
 1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
@@ -255,16 +264,28 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[PhoBERT](https://huggingface.co/transformers/model_doc/phobert.html)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[SegFormer](https://huggingface.co/transformers/model_doc/segformer.html)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
+1. **[SEW](https://huggingface.co/transformers/model_doc/sew.html)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
+1. **[SEW-D](https://huggingface.co/transformers/model_doc/sew_d.html)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
 1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/model_doc/speech_to_text_2.html)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+1. **[TrOCR](https://huggingface.co/transformers/model_doc/trocr.html)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
+1. **[UniSpeech](https://huggingface.co/transformers/model_doc/unispeech.html)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
+1. **[UniSpeechSat](https://huggingface.co/transformers/model_doc/unispeech_sat.html)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER
+AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
 1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
 1. **[VisualBERT](https://huggingface.co/transformers/model_doc/visual_bert.html)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
 1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
--- a/README_ko.md
+++ b/README_ko.md
@@ -0,0 +1,327 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<p align="center">
+    <br>
+    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
+    <br>
+</p>
+<p align="center">
+    <a href="https://circleci.com/gh/huggingface/transformers">
+        <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
+        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
+    </a>
+    <a href="https://huggingface.co/transformers/index.html">
+        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/transformers/index.html.svg?down_color=red&down_message=offline&up_message=online">
+    </a>
+    <a href="https://github.com/huggingface/transformers/releases">
+        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/master/CODE_OF_CONDUCT.md">
+        <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
+    </a>
+    <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
+</p>
+
+<h4 align="center">
+    <p>
+        <a href="https://github.com/huggingface/transformers/">English</a> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hans.md">简体中文</a> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hant.md">繁體中文</a> |
+        <b>한국어</b>
+    </p>
+</h4>
+
+<h3 align="center">
+    <p> Jax, PyTorch, TensorFlow를 위한 최첨단 자연어처리</p>
+</h3>
+
+<h3 align="center">
+    <a href="https://hf.co/course"><img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/course_banner.png"></a>
+</h3>
+
+🤗 Transformers는 분류, 정보 추출, 질문 답변, 요약, 번역, 문장 생성 등을 100개 이상의 언어로 수행할 수 있는 수천개의 사전학습된 모델을 제공합니다. 우리의 목표는 모두가 최첨단의 NLP 기술을 쉽게 사용하는 것입니다.
+
+🤗 Transformers는 이러한 사전학습 모델을 빠르게 다운로드해 특정 텍스트에 사용하고, 원하는 데이터로 fine-tuning해 커뮤니티나 우리의 [모델 허브](https://huggingface.co/models)에 공유할 수 있도록 API를 제공합니다. 또한, 모델 구조를 정의하는 각 파이썬 모듈은 완전히 독립적이어서 연구 실험을 위해 손쉽게 수정할 수 있습니다.
+
+🤗 Transformers는 가장 유명한 3개의 딥러닝 라이브러리를 지원합니다. 이들은 서로 완벽히 연동됩니다 — [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/), [TensorFlow](https://www.tensorflow.org/). 간단하게 이 라이브러리 중 하나로 모델을 학습하고, 또 다른 라이브러리로 추론을 위해 모델을 불러올 수 있습니다. 
+
+## 온라인 데모
+
+대부분의 모델을 [모델 허브](https://huggingface.co/models) 페이지에서 바로 테스트해볼 수 있습니다. 공개 및 비공개 모델을 위한 [비공개 모델 호스팅, 버전 관리, 추론 API](https://huggingface.co/pricing)도 제공합니다.
+
+예시:
+- [BERT로 마스킹된 단어 완성하기](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [Electra를 이용한 개체명 인식](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [GPT-2로 텍스트 생성하기](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
+- [RoBERTa로 자연어 추론하기](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [BART를 이용한 요약](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [DistilBERT를 이용한 질문 답변](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [T5로 번역하기](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+
+**[Transformer와 글쓰기](https://transformer.huggingface.co)** 는 이 저장소의 텍스트 생성 능력에 관한 Hugging Face 팀의 공식 데모입니다. 
+
+## Hugging Face 팀의 커스텀 지원을 원한다면
+
+<a target="_blank" href="https://huggingface.co/support">
+    <img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
+</a><br>
+
+## 퀵 투어
+
+원하는 텍스트에 바로 모델을 사용할 수 있도록, 우리는 `pipeline` API를 제공합니다. Pipeline은 사전학습 모델과 그 모델을 학습할 때 적용한 전처리 방식을 하나로 합칩니다. 다음은 긍정적인 텍스트와 부정적인 텍스트를 분류하기 위해 pipeline을 사용한 간단한 예시입니다:
+
+```python
+>>> from transformers import pipeline
+
+# Allocate a pipeline for sentiment-analysis
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+코드의 두번째 줄은 pipeline이 사용하는 사전학습 모델을 다운로드하고 캐시로 저장합니다. 세번째 줄에선 그 모델이 주어진 텍스트를 평가합니다. 여기서 모델은 99.97%의 확률로 텍스트가 긍정적이라고 평가했습니다.
+
+많은 NLP 과제들을 `pipeline`으로 바로 수행할 수 있습니다. 예를 들어, 질문과 문맥이 주어지면 손쉽게 답변을 추출할 수 있습니다:
+
+```python
+>>> from transformers import pipeline
+
+# Allocate a pipeline for question-answering
+>>> question_answerer = pipeline('question-answering')
+>>> question_answerer({
+...     'question': 'What is the name of the repository ?',
+...     'context': 'Pipeline has been included in the huggingface/transformers repository'
+... })
+{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}
+
+```
+
+답변뿐만 아니라, 여기에 사용된 사전학습 모델은 확신도와 토크나이즈된 문장 속 답변의 시작점, 끝점까지 반환합니다. [이 튜토리얼](https://huggingface.co/transformers/task_summary.html)에서 `pipeline` API가 지원하는 다양한 과제를 확인할 수 있습니다.
+
+코드 3줄로 원하는 과제에 맞게 사전학습 모델을 다운로드 받고 사용할 수 있습니다. 다음은 PyTorch 버전입니다:
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = AutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+다음은 TensorFlow 버전입니다:
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+토크나이저는 사전학습 모델의 모든 전처리를 책임집니다. 그리고 (위의 예시처럼) 1개의 스트링이나 리스트도 처리할 수 있습니다. 토크나이저는 딕셔너리를 반환하는데, 이는 다운스트림 코드에 사용하거나 언패킹 연산자 ** 를 이용해 모델에 바로 전달할 수도 있습니다.
+
+모델 자체는 일반적으로 사용되는 [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module)이나 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)입니다. [이 튜토리얼](https://huggingface.co/transformers/training.html)은 이러한 모델을 표준적인 PyTorch나 TensorFlow 학습 과정에서 사용하는 방법, 또는 새로운 데이터로 fine-tune하기 위해 `Trainer` API를 사용하는 방법을 설명해줍니다.
+
+## 왜 transformers를 사용해야 할까요?
+
+1. 손쉽게 사용할 수 있는 최첨단 모델:
+    - NLU와 NLG 과제에서 뛰어난 성능을 보입니다.
+    - 교육자와 실무자에게 진입 장벽이 낮습니다.
+    - 3개의 클래스만 배우면 바로 사용할 수 있습니다.
+    - 하나의 API로 모든 사전학습 모델을 사용할 수 있습니다.
+
+1. 더 적은 계산 비용, 더 적은 탄소 발자국:
+    - 연구자들은 모델을 계속 다시 학습시키는 대신 학습된 모델을 공유할 수 있습니다.
+    - 실무자들은 학습에 필요한 시간과 비용을 절약할 수 있습니다.
+    - 수십개의 모델 구조, 2,000개 이상의 사전학습 모델, 100개 이상의 언어로 학습된 모델 등.
+
+1. 모델의 각 생애주기에 적합한 프레임워크:
+    - 코드 3줄로 최첨단 모델을 학습하세요.
+    - 자유롭게 모델을 TF2.0나 PyTorch 프레임워크로 변환하세요.
+    - 학습, 평가, 공개 등 각 단계에 맞는 프레임워크를 원하는대로 선택하세요.
+
+1. 필요한 대로 모델이나 예시를 커스터마이즈하세요:
+    - 우리는 저자가 공개한 결과를 재현하기 위해 각 모델 구조의 예시를 제공합니다.
+    - 모델 내부 구조는 가능한 일관적으로 공개되어 있습니다.
+    - 빠른 실험을 위해 모델 파일은 라이브러리와 독립적으로 사용될 수 있습니다.
+
+## 왜 transformers를 사용하지 말아야 할까요?
+
+- 이 라이브러리는 신경망 블록을 만들기 위한 모듈이 아닙니다. 연구자들이 여러 파일을 살펴보지 않고 바로 각 모델을 사용할 수 있도록, 모델 파일 코드의 추상화 수준을 적정하게 유지했습니다.
+- 학습 API는 모든 모델에 적용할 수 있도록 만들어지진 않았지만, 라이브러리가 제공하는 모델들에 적용할 수 있도록 최적화되었습니다. 일반적인 머신 러닝을 위해선, 다른 라이브러리를 사용하세요.
+- 가능한 많은 사용 예시를 보여드리고 싶어서, [예시 폴더](https://github.com/huggingface/transformers/tree/master/examples)의 스크립트를 준비했습니다. 이 스크립트들을 수정 없이 특정한 문제에 바로 적용하지 못할 수 있습니다. 필요에 맞게 일부 코드를 수정해야 할 수 있습니다.
+
+## 설치
+
+### pip로 설치하기
+
+이 저장소는 Python 3.6+, Flax 0.3.2+, PyTorch 1.3.1+, TensorFlow 2.3+에서 테스트 되었습니다.
+
+[가상 환경](https://docs.python.org/3/library/venv.html)에 🤗 Transformers를 설치하세요. Python 가상 환경에 익숙하지 않다면, [사용자 가이드](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)를 확인하세요.
+
+우선, 사용할 Python 버전으로 가상 환경을 만들고 실행하세요.
+
+그 다음, Flax, PyTorch, TensorFlow 중 적어도 하나는 설치해야 합니다.
+플랫폼에 맞는 설치 명령어를 확인하기 위해 [TensorFlow 설치 페이지](https://www.tensorflow.org/install/), [PyTorch 설치 페이지](https://pytorch.org/get-started/locally/#start-locally), [Flax 설치 페이지](https://github.com/google/flax#quick-install)를 확인하세요.
+
+이들 중 적어도 하나가 설치되었다면, 🤗 Transformers는 다음과 같이 pip을 이용해 설치할 수 있습니다:
+
+```bash
+pip install transformers
+```
+
+예시들을 체험해보고 싶거나, 최최최첨단 코드를 원하거나, 새로운 버전이 나올 때까지 기다릴 수 없다면 [라이브러리를 소스에서 바로 설치](https://huggingface.co/transformers/installation.html#installing-from-source)하셔야 합니다.
+
+### conda로 설치하기
+
+Transformers 버전 v4.0.0부터, conda 채널이 생겼습니다: `huggingface`.
+
+🤗 Transformers는 다음과 같이 conda로 설치할 수 있습니다:
+
+```shell script
+conda install -c huggingface transformers
+```
+
+Flax, PyTorch, TensorFlow 설치 페이지에서 이들을 conda로 설치하는 방법을 확인하세요.
+
+## 모델 구조
+
+**🤗 Transformers가 제공하는 [모든 모델 체크포인트](https://huggingface.co/models)** 는 huggingface.co [모델 허브](https://huggingface.co)에 완벽히 연동되어 있습니다. [개인](https://huggingface.co/users)과 [기관](https://huggingface.co/organizations)이 모델 허브에 직접 업로드할 수 있습니다.
+
+현재 사용 가능한 모델 체크포인트의 개수: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
+🤗 Transformers는 다음 모델들을 제공합니다 (각 모델의 요약은 [여기](https://huggingface.co/transformers/model_summary.html)서 확인하세요):
+
+1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BARTpho](https://huggingface.co/transformers/model_doc/bartpho.html)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
+1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
+1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[BERTweet](https://huggingface.co/transformers/model_doc/bertweet.html)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
+1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
+1. **[ByT5](https://huggingface.co/transformers/model_doc/byt5.html)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
+1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+1. **[CANINE](https://huggingface.co/transformers/model_doc/canine.html)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
+1. **[CLIP](https://huggingface.co/transformers/model_doc/clip.html)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
+1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
+1. **[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
+1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DeiT](https://huggingface.co/transformers/model_doc/deit.html)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
+1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
+1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
+1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+1. **[FNet](https://huggingface.co/transformers/model_doc/fnet.html)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
+1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
+1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
+1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
+1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
+1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
+1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
+1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
+1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
+1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
+1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
+1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
+1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[PhoBERT](https://huggingface.co/transformers/model_doc/phobert.html)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
+1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
+1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (from ZhuiyiTechnology), released together with the paper [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[SegFormer](https://huggingface.co/transformers/model_doc/segformer.html)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
+1. **[SEW](https://huggingface.co/transformers/model_doc/sew.html)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
+1. **[SEW-D](https://huggingface.co/transformers/model_doc/sew_d.html)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
+1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
+1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/model_doc/speech_to_text_2.html)** (from Facebook), released together with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (from Tel Aviv University), released together with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
+1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+1. **[TrOCR](https://huggingface.co/transformers/model_doc/trocr.html)** (from Microsoft), released together with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
+1. **[UniSpeech](https://huggingface.co/transformers/model_doc/unispeech.html)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
+1. **[UniSpeechSat](https://huggingface.co/transformers/model_doc/unispeech_sat.html)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
+1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+1. **[VisualBERT](https://huggingface.co/transformers/model_doc/visual_bert.html)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
+1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
+1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+1. **[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
+1. 새로운 모델을 올리고 싶나요? 우리가 **상세한 가이드와 템플릿** 으로 새로운 모델을 올리도록 도와드릴게요. 가이드와 템플릿은 이 저장소의 [`templates`](./templates) 폴더에서 확인하실 수 있습니다. [컨트리뷰션 가이드라인](./CONTRIBUTING.md)을 꼭 확인해주시고, PR을 올리기 전에 메인테이너에게 연락하거나 이슈를 오픈해 피드백을 받으시길 바랍니다. 
+
+각 모델이 Flax, PyTorch, TensorFlow로 구현되었는지 또는 🤗 Tokenizers 라이브러리가 지원하는 토크나이저를 사용하는지 확인하려면, [이 표](https://huggingface.co/transformers/index.html#supported-frameworks)를 확인하세요.
+
+이 구현은 여러 데이터로 검증되었고 (예시 스크립트를 참고하세요) 오리지널 구현의 성능과 같아야 합니다. [도큐먼트](https://huggingface.co/transformers/examples.html)의 Examples 섹션에서 성능에 대한 자세한 설명을 확인할 수 있습니다.
+
+## 더 알아보기
+
+| 섹션 | 설명 |
+|-|-|
+| [도큐먼트](https://huggingface.co/transformers/) | 전체 API 도큐먼트와 튜토리얼 |
+| [과제 요약](https://huggingface.co/transformers/task_summary.html) | 🤗 Transformers가 지원하는 과제들 |
+| [전처리 튜토리얼](https://huggingface.co/transformers/preprocessing.html) | `Tokenizer` 클래스를 이용해 모델을 위한 데이터 준비하기 |
+| [학습과 fine-tuning](https://huggingface.co/transformers/training.html) | 🤗 Transformers가 제공하는 모델을 PyTorch/TensorFlow 학습 과정과 `Trainer` API에서 사용하기 |
+| [퀵 투어: Fine-tuning/사용 스크립트](https://github.com/huggingface/transformers/tree/master/examples) | 다양한 과제에서 모델 fine-tuning하는 예시 스크립트 |
+| [모델 공유 및 업로드](https://huggingface.co/transformers/model_sharing.html) | 커뮤니티에 fine-tune된 모델을 업로드 및 공유하기 |
+| [마이그레이션](https://huggingface.co/transformers/migration.html) | `pytorch-transformers`나 `pytorch-pretrained-bert`에서 🤗 Transformers로 이동하기|
+
+## 인용
+
+🤗 Transformers 라이브러리를 인용하고 싶다면, 이 [논문](https://www.aclweb.org/anthology/2020.emnlp-demos.6/)을 인용해 주세요:
+```bibtex
+@inproceedings{wolf-etal-2020-transformers,
+    title = "Transformers: State-of-the-Art Natural Language Processing",
+    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = oct,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+    pages = "38--45"
+}
+```
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -67,7 +67,8 @@ checkpoint: 检查点
    <p>
        <a href="https://github.com/huggingface/transformers/">English</a> |
        <b>简体中文</b> |
-        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hant.md">繁體中文</a>
+        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hant.md">繁體中文</a> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_ko.md">한국어</a>
    </p>
 </h4>

@@ -235,10 +236,13 @@ conda install -c huggingface transformers
 1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (来自 Google Research and the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。
 1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (来自 Facebook) 伴随论文 [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) 由 Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer 发布。
 1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (来自 École polytechnique) 伴随论文 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 由 Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis 发布。
+1. **[BARTpho](https://huggingface.co/transformers/model_doc/bartpho.html)** (来自 VinAI Research) 伴随论文 [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) 由 Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen 发布。
+1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (来自 Microsoft) 伴随论文 [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) 由 Hangbo Bao, Li Dong, Furu Wei 发布。
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (来自 Google) 伴随论文 [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) 由 Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova 发布。
 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (来自 Google) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。
-1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
+1. **[BERTweet](https://huggingface.co/transformers/model_doc/bertweet.html)** (来自 VinAI Research) 伴随论文 [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) 由 Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen 发布。
 1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
+1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
 1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
 1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
 1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (来自 Alexa) 伴随论文 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 由 Adrian de Wynter and Daniel J. Perry 发布。
@@ -255,18 +259,21 @@ conda install -c huggingface transformers
 1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (来自 Facebook) 伴随论文 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 由 Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 发布。
 1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。
 1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) 和德语版 DistilBERT。
-1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval
-for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon
-Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。
+1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。
 1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。
+1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (来自 Google Research) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。
 1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。
+1. **[FNet](https://huggingface.co/transformers/model_doc/fnet.html)** (来自 Google Research) 伴随论文 [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) 由 James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon 发布。
 1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。
 1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。
-1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
 1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy。
+1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
+1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (来自 EleutherAI) 随仓库 [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) 发布。作者为 Ben Wang and Aran Komatsuzaki。
 1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
 1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
 1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
+1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) 由 Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou 发布。
+1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) 由 Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei 发布。
 1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
 1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
 1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。
@@ -279,17 +286,30 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布
 1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。
-1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
+1. **[PhoBERT](https://huggingface.co/transformers/model_doc/phobert.html)** (来自 VinAI Research) 伴随论文 [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) 由 Dat Quoc Nguyen and Anh Tuan Nguyen 发布。
 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。
+1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (来自 Google Research) 伴随论文 [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) 由 Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder 发布。
 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。
 1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。
+1. **[SegFormer](https://huggingface.co/transformers/model_doc/segformer.html)** (来自 NVIDIA) 伴随论文 [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) 由 Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo 发布。
+1. **[SEW](https://huggingface.co/transformers/model_doc/sew.html)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。
+1. **[SEW-D](https://huggingface.co/transformers/model_doc/sew_d.html)** (来自 ASAPP) 伴随论文 [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) 由 Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi 发布。
+1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/model_doc/speechencoderdecoder.html)** 
 1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (来自 Facebook), 伴随论文 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 发布。
-1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
+1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/model_doc/speech_to_text_2.html)** (来自 Facebook) 伴随论文 [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) 由 Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau 发布。
+1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (来自 Tel Aviv University) 伴随论文 [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) 由 Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy 发布。
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (来自 Berkeley) 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
+1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (来自 Google AI) 随仓库 [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) 发布。作者为 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu。
 1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。
 1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
+1. **[TrOCR](https://huggingface.co/transformers/model_doc/trocr.html)** (来自 Microsoft) 伴随论文 [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) 由 Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei 发布。
+1. **[UniSpeech](https://huggingface.co/transformers/model_doc/unispeech.html)** (来自 Microsoft Research) 伴随论文 [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) 由 Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang 发布。
+1. **[UniSpeechSat](https://huggingface.co/transformers/model_doc/unispeech_sat.html)** (来自 Microsoft Research) 伴随论文 [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) 由 Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu 发布。
 1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。
+1. **[VisionEncoderDecoder](https://huggingface.co/transformers/model_doc/visionencoderdecoder.html)** 
 1. **[VisualBERT](https://huggingface.co/transformers/model_doc/visual_bert.html)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。
 1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。
 1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -79,7 +79,8 @@ user: 使用者
    <p>
        <a href="https://github.com/huggingface/transformers/">English</a> |
        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hans.md">简体中文</a> |
-        <b>繁體中文</b>
+        <b>繁體中文</b> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_ko.md">한국어</a>
    </p>
 </h4>

@@ -247,10 +248,13 @@ conda install -c huggingface transformers
 1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
 1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BARTpho](https://huggingface.co/transformers/model_doc/bartpho.html)** (from VinAI Research) released with the paper [BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese](https://arxiv.org/abs/2109.09701) by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
+1. **[BEiT](https://huggingface.co/transformers/model_doc/beit.html)** (from Microsoft) released with the paper [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) by Hangbo Bao, Li Dong, Furu Wei.
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[BERTweet](https://huggingface.co/transformers/model_doc/bertweet.html)** (from VinAI Research) released with the paper [BERTweet: A pre-trained language model for English Tweets](https://aclanthology.org/2020.emnlp-demos.2/) by Dat Quoc Nguyen, Thanh Vu and Anh Tuan Nguyen.
 1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
 1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
 1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
 1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
@@ -267,23 +271,26 @@ conda install -c huggingface transformers
 1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
 1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
 1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
-1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
-for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
-Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+1. **[EncoderDecoder](https://huggingface.co/transformers/model_doc/encoderdecoder.html)** (from Google Research) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+1. **[FNet](https://huggingface.co/transformers/model_doc/fnet.html)** (from Google Research) released with the paper [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon.
 1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
 1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
 1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
+1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+1. **[GPT-J](https://huggingface.co/transformers/model_doc/gptj.html)** (from EleutherAI) released in the repository [kingoflolz/mesh-transformer-jax](https://github.com/kingoflolz/mesh-transformer-jax/) by Ben Wang and Aran Komatsuzaki.
 1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
+1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
 1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+1. **[LayoutLMv2](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding](https://arxiv.org/abs/2012.14740) by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou.
+1. **[LayoutXLM](https://huggingface.co/transformers/model_doc/layoutlmv2.html)** (from Microsoft Research Asia) released with the paper [LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding](https://arxiv.org/abs/2104.08836) by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
 1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
 1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
 1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
@@ -291,17 +298,30 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
 1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
 1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[PhoBERT](https://huggingface.co/transformers/model_doc/phobert.html)** (from VinAI Research) released with the paper [PhoBERT: Pre-trained language models for Vietnamese](https://www.aclweb.org/anthology/2020.findings-emnlp.92/) by Dat Quoc Nguyen and Anh Tuan Nguyen.
 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RemBERT](https://huggingface.co/transformers/model_doc/rembert.html)** (from Google Research) released with the paper [Rethinking embedding coupling in pre-trained language models](https://arxiv.org/pdf/2010.12821.pdf) by Hyung Won Chung, Thibault Févry, Henry Tsai, M. Johnson, Sebastian Ruder.
 1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[SegFormer](https://huggingface.co/transformers/model_doc/segformer.html)** (from NVIDIA) released with the paper [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
+1. **[SEW](https://huggingface.co/transformers/model_doc/sew.html)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
+1. **[SEW-D](https://huggingface.co/transformers/model_doc/sew_d.html)** (from ASAPP) released with the paper [Performance-Efficiency Trade-offs in Unsupervised Pre-training for Speech Recognition](https://arxiv.org/abs/2109.06870) by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
+1. **[SpeechEncoderDecoder](https://huggingface.co/transformers/model_doc/speechencoderdecoder.html)** 
 1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+1. **[SpeechToTextTransformer2](https://huggingface.co/transformers/model_doc/speech_to_text_2.html)** (from Facebook) released with the paper [Large-Scale Self- and Semi-Supervised Learning for Speech Translation](https://arxiv.org/abs/2104.06678) by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+1. **[Splinter](https://huggingface.co/transformers/model_doc/splinter.html)** (from Tel Aviv University) released with the paper [Few-Shot Question Answering by Pretraining Span Selection](https://arxiv.org/abs/2101.00438) by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy.
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** (from Berkeley) released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[T5v1.1](https://huggingface.co/transformers/model_doc/t5v1.1.html)** (from Google AI) released in the repository [google-research/text-to-text-transfer-transformer](https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
 1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+1. **[TrOCR](https://huggingface.co/transformers/model_doc/trocr.html)** (from Microsoft) released with the paper [TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models](https://arxiv.org/abs/2109.10282) by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
+1. **[UniSpeech](https://huggingface.co/transformers/model_doc/unispeech.html)** (from Microsoft Research) released with the paper [UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data](https://arxiv.org/abs/2101.07597) by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
+1. **[UniSpeechSat](https://huggingface.co/transformers/model_doc/unispeech_sat.html)** (from Microsoft Research) released with the paper [UNISPEECH-SAT: UNIVERSAL SPEECH REPRESENTATION LEARNING WITH SPEAKER AWARE PRE-TRAINING](https://arxiv.org/abs/2110.05752) by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
 1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+1. **[VisionEncoderDecoder](https://huggingface.co/transformers/model_doc/visionencoderdecoder.html)** 
 1. **[VisualBERT](https://huggingface.co/transformers/model_doc/visual_bert.html)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
 1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
 1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
--- a/docs/README.md
+++ b/docs/README.md
@@ -166,7 +166,7 @@ Values that should be put in `code` should either be surrounded by double backti
 an object using the :obj: syntax: :obj:\`like so\`. Note that argument names and objects like True, None or any strings
 should usually be put in `code`.

-When mentionning a class, it is recommended to use the :class: syntax as the mentioned class will be automatically
+When mentioning a class, it is recommended to use the :class: syntax as the mentioned class will be automatically
 linked by Sphinx: :class:\`~transformers.XXXClass\`

 When mentioning a function, it is recommended to use the :func: syntax as the mentioned function will be automatically
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,10 +1,13 @@
 // These two things need to be updated at each release for the version selector.
 // Last stable version
-const stableVersion = "v4.8.2"
+const stableVersion = "v4.11.3"
 // Dictionary doc folder to label. The last stable version should have an empty key.
 const versionMapping = {
    "master": "master",
-    "": "v4.8.0/v4.8.1/v4.8.2 (stable)",
+    "": "v4.11.0/v4.11.1/v4.11.2/v4.11.3 (stable)",
+    "v4.10.1": "v4.10.0/v4.10.1",
+    "v4.9.2": "v4.9.0/v4.9.1/v4.9.2",
+    "v4.8.2": "v4.8.0/v4.8.1/v4.8.2",
    "v4.7.0": "v4.7.0",
    "v4.6.0": "v4.6.0",
    "v4.5.1": "v4.5.0/v4.5.1",
--- a/docs/source/add_new_model.rst
+++ b/docs/source/add_new_model.rst
@@ -76,7 +76,7 @@ Let's take a look:

 As you can see, we do make use of inheritance in 🤗 Transformers, but we keep the level of abstraction to an absolute
 minimum. There are never more than two levels of abstraction for any model in the library. :obj:`BrandNewBertModel`
-inherits from :obj:`BrandNewBertPreTrainedModel` which in turn inherits from :class:`~transformres.PreTrainedModel` and
+inherits from :obj:`BrandNewBertPreTrainedModel` which in turn inherits from :class:`~transformers.PreTrainedModel` and
 that's it. As a general rule, we want to make sure that a new model only depends on
 :class:`~transformers.PreTrainedModel`. The important functionalities that are automatically provided to every new
 model are :meth:`~transformers.PreTrainedModel.from_pretrained` and
@@ -271,7 +271,7 @@ logical components from one another and to have faster debugging cycles as inter
 notebooks are often easier to share with other contributors, which might be very helpful if you want to ask the Hugging
 Face team for help. If you are familiar with Jupiter notebooks, we strongly recommend you to work with them.

-The obvious disadvantage of Jupyther notebooks is that if you are not used to working with them you will have to spend
+The obvious disadvantage of Jupyter notebooks is that if you are not used to working with them you will have to spend
 some time adjusting to the new programming environment and that you might not be able to use your known debugging tools
 anymore, like ``ipdb``.

@@ -674,7 +674,7 @@ the ``input_ids`` (usually the word embeddings) are identical. And then work you
 network. At some point, you will notice a difference between the two implementations, which should point you to the bug
 in the 🤗 Transformers implementation. From our experience, a simple and efficient way is to add many print statements
 in both the original implementation and 🤗 Transformers implementation, at the same positions in the network
-respectively, and to successively remove print statements showing the same values for intermediate presentions.
+respectively, and to successively remove print statements showing the same values for intermediate representations.

 When you're confident that both implementations yield the same output, verifying the outputs with
 ``torch.allclose(original_output, output, atol=1e-3)``, you're done with the most difficult part! Congratulations - the
--- a/docs/source/add_new_pipeline.rst
+++ b/docs/source/add_new_pipeline.rst
@@ -0,0 +1,144 @@
+..
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+How to add a pipeline to 🤗 Transformers?
+=======================================================================================================================
+
+First and foremost, you need to decide the raw entries the pipeline will be able to take. It can be strings, raw bytes,
+dictionaries or whatever seems to be the most likely desired input. Try to keep these inputs as pure Python as possible
+as it makes compatibility easier (even through other languages via JSON). Those will be the :obj:`inputs` of the
+pipeline (:obj:`preprocess`).
+
+Then define the :obj:`outputs`. Same policy as the :obj:`inputs`. The simpler, the better. Those will be the outputs of
+the :obj:`postprocess` method.
+
+Start by inheriting from the base class :obj:`Pipeline` and implementing the 4 required methods: :obj:`preprocess`,
+:obj:`_forward`, :obj:`postprocess` and :obj:`_sanitize_parameters`.
+
+
+.. code-block::
+
+    from transformers import Pipeline
+
+    class MyPipeline(Pipeline):
+        def _sanitize_parameters(self, **kwargs)
+            preprocess_kwargs = {}
+            if "maybe_arg" in kwargs:
+                preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
+            return preprocess_kwargs, {}, {}
+
+        def preprocess(self, inputs, maybe_arg=2)
+            model_input = Tensor(....)
+            return {"model_input": model_input}
+
+        def _forward(self, model_inputs)
+            # model_inputs == {"model_input": model_input}
+            oututs = self.model(**model_inputs)
+            # Maybe {"logits": Tensor(...)}
+            return outputs
+
+        def postprocess(self, model_outputs)
+            best_class = model_outputs["logits"].softmax(-1)
+            return best_class
+
+
+This breakdown is designed to provide relatively seamless support for CPU/GPU, while allowing pre/postprocessing to be
+done on the CPU on different threads.
+
+:obj:`preprocess` will take the originally defined inputs, and turn them into something feedable to the model. It might
+contain more information and is usually a :obj:`Dict`.
+
+:obj:`_forward` is the implementation detail and is not meant to be called directly. :obj:`forward` is the preferred
+method to call as it contains safeguards to make sure everything is working on the expected device. If anything is
+linked to a real model it belongs in the :obj:`_forward` method, anything else is in the preprocess/postprocess.
+
+:obj:`postprocess` methods will take the output of :obj:`_forward` and turn it into the final output that was decided
+earlier.
+
+:obj:`_sanitize_parameters` exists to allow users to pass any parameters whenever they wish, be it at initialization
+time ``pipeline(...., maybe_arg=4)`` or at call time ``pipe = pipeline(...); output = pipe(...., maybe_arg=4)``.
+
+The returns of :obj:`_sanitize_parameters` are the 3 dicts of kwargs that will be passed directly to :obj:`preprocess`,
+:obj:`_forward` and :obj:`postprocess`. Don't fill anything if the caller didn't call with any extra parameter. That
+allows keeping the default arguments in the function definition, which is always more "natural".
+
+A classic example would be a :obj:`top_k` argument in the post processing in classification tasks.
+
+.. code-block::
+
+    >>> pipe = pipeline("my-new-task")
+    >>> pipe("This is a test")
+    [{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}, {"label": "3-star", "score": 0.05},
+    {"label": "4-star", "score": 0.025}, {"label": "5-star", "score": 0.025}]
+
+    >>> pipe("This is a test", top_k=2)
+    [{"label": "1-star", "score": 0.8}, {"label": "2-star", "score": 0.1}]
+
+In order to achieve that, we'll update our :obj:`postprocess` method with a default parameter set to :obj:`5` and edit
+:obj:`_sanitize_parameters` to allow this new parameter.
+
+
+.. code-block::
+
+
+        def postprocess(self, model_outputs, top_k=5)
+            best_class = model_outputs["logits"].softmax(-1)
+            # Add logic to handle top_k
+            return best_class
+
+        def _sanitize_parameters(self, **kwargs)
+            preprocess_kwargs = {}
+            if "maybe_arg" in kwargs:
+                preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"]
+
+            postprocess_kwargs = {}
+            if "top_k" in kwargs:
+                preprocess_kwargs["top_k"] = kwargs["top_k"]
+            return preprocess_kwargs, {}, postprocess_kwargs
+
+Try to keep the inputs/outputs very simple and ideally JSON-serializable as it makes the pipeline usage very easy
+without requiring users to understand new kinds of objects. It's also relatively common to support many different types
+of arguments for ease of use (audio files can be filenames, URLs or pure bytes).
+
+
+
+Adding it to the list of supported tasks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Go to ``src/transformers/pipelines/__init__.py`` and fill in :obj:`SUPPORTED_TASKS` with your newly created pipeline.
+If possible it should provide a default model.
+
+Adding tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Create a new file ``tests/test_pipelines_MY_PIPELINE.py``, following the example of the other tests.
+
+The :obj:`run_pipeline_test` function will be very generic and run on small random models on every possible
+architecture as defined by :obj:`model_mapping` and :obj:`tf_model_mapping`.
+
+This is very important to test future compatibility, meaning if someone adds a new model for
+:obj:`XXXForQuestionAnswering` then the pipeline test will attempt to run on it. Because the models are random it's
+impossible to check for actual values, that's why there is a helper :obj:`ANY` that will simply attempt to match the
+TYPE of the pipeline output.
+
+You also *need* to implement 2 (ideally 4) tests.
+
+- :obj:`test_small_model_pt` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
+  and test the pipeline outputs. The results should be the same as :obj:`test_small_model_tf`.
+- :obj:`test_small_model_tf` : Define 1 small model for this pipeline (doesn't matter if the results don't make sense)
+  and test the pipeline outputs. The results should be the same as :obj:`test_small_model_pt`.
+- :obj:`test_large_model_pt` (:obj:`optional`): Tests the pipeline on a real model where the results are supposed to
+  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
+  sure there is no drift in future releases.
+- :obj:`test_large_model_tf` (:obj:`optional`): Tests the pipeline on a real model where the results are supposed to
+  make sense. These tests are slow and should be marked as such. Here the goal is to showcase the pipeline and to make
+  sure there is no drift in future releases.
--- a/docs/source/community.md
+++ b/docs/source/community.md
@@ -1,4 +1,4 @@
-# Community
+# Community

 This page regroups resources around 🤗 Transformers developed by the community.

@@ -12,6 +12,7 @@ This page regroups resources around 🤗 Transformers developed by the community

 | Notebook     |      Description      |      Author      |      |
 |:----------|:-------------|:-------------|------:|
+| [Fine-tune a pre-trained Transformer to generate lyrics](https://github.com/AlekseyKorshuk/huggingartists) | How to generate lyrics in the style of your favorite artist by fine-tuning a GPT-2 model |  [Aleksey Korshuk](https://github.com/AlekseyKorshuk) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/AlekseyKorshuk/huggingartists/blob/master/huggingartists-demo.ipynb) |
 | [Train T5 in Tensorflow 2 ](https://github.com/snapthat/TF-T5-text-to-text) | How to train T5 for any task using Tensorflow 2. This notebook demonstrates a Question & Answer task implemented in Tensorflow 2 using SQUAD | [Muhammad Harris](https://github.com/HarrisDePerceptron) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/snapthat/TF-T5-text-to-text/blob/master/snapthatT5/notebooks/TF-T5-Datasets%20Training.ipynb) |
 | [Train T5 on TPU](https://github.com/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb)  | How to train T5 on SQUAD with Transformers and Nlp | [Suraj Patil](https://github.com/patil-suraj) |[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/T5_on_TPU.ipynb#scrollTo=QLGiFCDqvuil) |
 | [Fine-tune T5 for Classification and Multiple Choice](https://github.com/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb)  | How to fine-tune T5 for classification and multiple choice tasks using a text-to-text format with PyTorch Lightning |  [Suraj Patil](https://github.com/patil-suraj) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patil-suraj/exploring-T5/blob/master/t5_fine_tuning.ipynb) |
@@ -35,7 +36,7 @@ This page regroups resources around 🤗 Transformers developed by the community
 |[fine-tune a non-English GPT-2 Model with Trainer class](https://github.com/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb) | How to fine-tune a non-English GPT-2 Model with Trainer class | [Philipp Schmid](https://www.philschmid.de) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/philschmid/fine-tune-GPT-2/blob/master/Fine_tune_a_non_English_GPT_2_Model_with_Huggingface.ipynb)|
 |[Fine-tune a DistilBERT Model for Multi Label Classification task](https://github.com/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb) | How to fine-tune a DistilBERT Model for Multi Label Classification task | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/Transformers_scripts/blob/master/Transformers_multilabel_distilbert.ipynb)|
 |[Fine-tune ALBERT for sentence-pair classification](https://github.com/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb) | How to fine-tune an ALBERT model or another BERT-based model for the sentence-pair classification task | [Nadir El Manouzi](https://github.com/NadirEM) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/NadirEM/nlp-notebooks/blob/master/Fine_tune_ALBERT_sentence_pair_classification.ipynb)|
-|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | How to fine-tune an Roberta model for sentiment analysis | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)|
+|[Fine-tune Roberta for sentiment analysis](https://github.com/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb) | How to fine-tune a Roberta model for sentiment analysis | [Dhaval Taunk](https://github.com/DhavalTaunk08) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/DhavalTaunk08/NLP_scripts/blob/master/sentiment_analysis_using_roberta.ipynb)|
 |[Evaluating Question Generation Models](https://github.com/flexudy-pipe/qugeev) | How accurate are the answers to questions generated by your seq2seq transformer model? | [Pascal Zoleko](https://github.com/zolekode) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1bpsSqCQU-iw_5nNoRm_crPq6FRuJthq_?usp=sharing)|
 |[Classify text with DistilBERT and Tensorflow](https://github.com/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb) | How to fine-tune DistilBERT for text classification in TensorFlow | [Peter Bayerle](https://github.com/peterbayerle) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/peterbayerle/huggingface_notebook/blob/main/distilbert_tf.ipynb)|
 |[Leverage BERT for Encoder-Decoder Summarization on CNN/Dailymail](https://github.com/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb) | How to warm-start a *EncoderDecoderModel* with a *bert-base-uncased* checkpoint for summarization on CNN/Dailymail | [Patrick von Platen](https://github.com/patrickvonplaten) | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/patrickvonplaten/notebooks/blob/master/BERT2BERT_for_CNN_Dailymail.ipynb)|
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -27,7 +27,14 @@ author = "huggingface"
 # The short X.Y version
 version = ""
 # The full version, including alpha/beta/rc tags
-release = u'4.7.0'
+release = "4.12.2"
+
+
+
+
+
+
+



@@ -208,6 +215,9 @@ epub_title = project
 # A list of files that should not be packed into the epub file.
 epub_exclude_files = ["search.html"]

+# Localization
+locale_dirs = ['locale/']
+gettext_compact = False

 def setup(app):
    app.add_css_file("css/huggingface.css")
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -13,8 +13,8 @@
 Converting Tensorflow Checkpoints
 =======================================================================================================================

-A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints in models
-than be loaded using the ``from_pretrained`` methods of the library.
+A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints to models
+that can be loaded using the ``from_pretrained`` methods of the library.

 .. note::
    Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**) available in any
--- a/docs/source/custom_datasets.rst
+++ b/docs/source/custom_datasets.rst
@@ -17,7 +17,7 @@ Fine-tuning with custom datasets

    The datasets used in this tutorial are available and can be more easily accessed using the `🤗 Datasets library
    <https://github.com/huggingface/datasets>`_. We do not use this library to access the datasets here since this
-    tutorial meant to illustrate how to work with your own data. A brief of introduction can be found at the end of the
+    tutorial is meant to illustrate how to work with your own data. A brief introduction can be found at the end of the
    tutorial in the section ":ref:`datasetslib`".

 This tutorial will take you through several examples of using 🤗 Transformers models with your own datasets. The guide
@@ -74,8 +74,8 @@ read this in.
    train_texts, train_labels = read_imdb_split('aclImdb/train')
    test_texts, test_labels = read_imdb_split('aclImdb/test')

-We now have a train and test dataset, but let's also also create a validation set which we can use for for evaluation
-and tuning without tainting our test set results. Sklearn has a convenient utility for creating such splits:
+We now have a train and test dataset, but let's also create a validation set which we can use for evaluation and
+tuning without tainting our test set results. Sklearn has a convenient utility for creating such splits:

 .. code-block:: python

@@ -91,8 +91,8 @@ pre-trained DistilBert, so let's use the DistilBert tokenizer.
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

 Now we can simply pass our texts to the tokenizer. We'll pass ``truncation=True`` and ``padding=True``, which will
-ensure that all of our sequences are padded to the same length and are truncated to be no longer model's maximum input
-length. This will allow us to feed batches of sequences into the model at the same time.
+ensure that all of our sequences are padded to the same length and are truncated to be no longer than the model's
+maximum input length. This will allow us to feed batches of sequences into the model at the same time.

 .. code-block:: python

@@ -143,7 +143,7 @@ can be easily batched such that each key in the batch encoding corresponds to a
        test_labels
    ))

-Now that our datasets our ready, we can fine-tune a model either with the 🤗
+Now that our datasets are ready, we can fine-tune a model either with the 🤗
 :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow. See :doc:`training
 <training>`.

@@ -213,7 +213,7 @@ instantiate a :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`.
 Fine-tuning with native PyTorch/TensorFlow
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-We can also train use native PyTorch or TensorFlow:
+We can also train using native PyTorch or TensorFlow:

 .. code-block:: python

--- a/docs/source/debugging.rst
+++ b/docs/source/debugging.rst
@@ -154,7 +154,7 @@ input elements was ``6.27e+04`` and same for the output was ``inf``.
 You can see here, that ``T5DenseGatedGeluDense.forward`` resulted in output activations, whose absolute max value was
 around 62.7K, which is very close to fp16's top limit of 64K. In the next frame we have ``Dropout`` which renormalizes
 the weights, after it zeroed some of the elements, which pushes the absolute max value to more than 64K, and we get an
-overlow (``inf``).
+overflow (``inf``).

 As you can see it's the previous frames that we need to look into when the numbers start going into very large for fp16
 numbers.
--- a/docs/source/imgs/parallelism-tp-parallel_gemm.png
+++ b/docs/source/imgs/parallelism-tp-parallel_gemm.png
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -105,187 +105,247 @@ Supported models
 3. :doc:`BARThez <model_doc/barthez>` (from École polytechnique) released with the paper `BARThez: a Skilled Pretrained
   French Sequence-to-Sequence Model <https://arxiv.org/abs/2010.12321>`__ by Moussa Kamal Eddine, Antoine J.-P.
   Tixier, Michalis Vazirgiannis.
-4. :doc:`BERT <model_doc/bert>` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
+4. :doc:`BARTpho <model_doc/bartpho>` (from VinAI Research) released with the paper `BARTpho: Pre-trained
+   Sequence-to-Sequence Models for Vietnamese <https://arxiv.org/abs/2109.09701>`__ by Nguyen Luong Tran, Duong Minh Le
+   and Dat Quoc Nguyen.
+5. :doc:`BEiT <model_doc/beit>` (from Microsoft) released with the paper `BEiT: BERT Pre-Training of Image Transformers
+   <https://arxiv.org/abs/2106.08254>`__ by Hangbo Bao, Li Dong, Furu Wei.
+6. :doc:`BERT <model_doc/bert>` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
   Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__ by Jacob Devlin, Ming-Wei Chang,
   Kenton Lee and Kristina Toutanova.
-5. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
+7. :doc:`BERTweet <model_doc/bertweet>` (from VinAI Research) released with the paper `BERTweet: A pre-trained language
+   model for English Tweets <https://aclanthology.org/2020.emnlp-demos.2/>`__ by Dat Quoc Nguyen, Thanh Vu and Anh Tuan
+   Nguyen.
+8. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
   Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
   Narayan, Aliaksei Severyn.
-6. :doc:`BigBird-RoBERTa <model_doc/bigbird>` (from Google Research) released with the paper `Big Bird: Transformers
+9. :doc:`BigBird-RoBERTa <model_doc/bigbird>` (from Google Research) released with the paper `Big Bird: Transformers
   for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua
   Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-7. :doc:`BigBird-Pegasus <model_doc/bigbird_pegasus>` (from Google Research) released with the paper `Big Bird:
-   Transformers for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by Manzil Zaheer, Guru Guruganesh, Avinava
-   Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
-8. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
-   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
-   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-9. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
-   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
-   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-10. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
+10. :doc:`BigBird-Pegasus <model_doc/bigbird_pegasus>` (from Google Research) released with the paper `Big Bird:
+    Transformers for Longer Sequences <https://arxiv.org/abs/2007.14062>`__ by Manzil Zaheer, Guru Guruganesh, Avinava
+    Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr
+    Ahmed.
+11. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
+    open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
+    Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+12. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building
+    an open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju,
+    Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+13. :doc:`BORT <model_doc/bort>` (from Alexa) released with the paper `Optimal Subarchitecture Extraction For BERT
    <https://arxiv.org/abs/2010.10499>`__ by Adrian de Wynter and Daniel J. Perry.
-11. :doc:`ByT5 <model_doc/byt5>` (from Google Research) released with the paper `ByT5: Towards a token-free future with
+14. :doc:`ByT5 <model_doc/byt5>` (from Google Research) released with the paper `ByT5: Towards a token-free future with
    pre-trained byte-to-byte models <https://arxiv.org/abs/2105.13626>`__ by Linting Xue, Aditya Barua, Noah Constant,
    Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
-12. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
+15. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
    French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
    Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-13. :doc:`CANINE <model_doc/canine>` (from Google Research) released with the paper `CANINE: Pre-training an Efficient
+16. :doc:`CANINE <model_doc/canine>` (from Google Research) released with the paper `CANINE: Pre-training an Efficient
    Tokenization-Free Encoder for Language Representation <https://arxiv.org/abs/2103.06874>`__ by Jonathan H. Clark,
    Dan Garrette, Iulia Turc, John Wieting.
-14. :doc:`CLIP <model_doc/clip>` (from OpenAI) released with the paper `Learning Transferable Visual Models From
+17. :doc:`CLIP <model_doc/clip>` (from OpenAI) released with the paper `Learning Transferable Visual Models From
    Natural Language Supervision <https://arxiv.org/abs/2103.00020>`__ by Alec Radford, Jong Wook Kim, Chris Hallacy,
    Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen
    Krueger, Ilya Sutskever.
-15. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
+18. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
    Span-based Dynamic Convolution <https://arxiv.org/abs/2008.02496>`__ by Zihang Jiang, Weihao Yu, Daquan Zhou,
    Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-16. :doc:`CPM <model_doc/cpm>` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative
+19. :doc:`CPM <model_doc/cpm>` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative
    Chinese Pre-trained Language Model <https://arxiv.org/abs/2012.00413>`__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei
    Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng,
    Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang,
    Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-17. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
+20. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
    Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
    Lav R. Varshney, Caiming Xiong and Richard Socher.
-18. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
+21. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
    Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu
    Chen.
-19. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
+22. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
    with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
    Weizhu Chen.
-20. :doc:`DeiT <model_doc/deit>` (from Facebook) released with the paper `Training data-efficient image transformers &
+23. :doc:`DeiT <model_doc/deit>` (from Facebook) released with the paper `Training data-efficient image transformers &
    distillation through attention <https://arxiv.org/abs/2012.12877>`__ by Hugo Touvron, Matthieu Cord, Matthijs
    Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-21. :doc:`DETR <model_doc/detr>` (from Facebook) released with the paper `End-to-End Object Detection with Transformers
+24. :doc:`DETR <model_doc/detr>` (from Facebook) released with the paper `End-to-End Object Detection with Transformers
    <https://arxiv.org/abs/2005.12872>`__ by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier,
    Alexander Kirillov, Sergey Zagoruyko.
-22. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+25. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
    Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
    Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-23. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+26. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
    distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
    Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
    `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
    version of DistilBERT.
-24. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+27. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
    Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
    Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-25. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+28. :doc:`EncoderDecoder <model_doc/encoderdecoder>` (from Google Research) released with the paper `Leveraging
+    Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
+    Narayan, Aliaksei Severyn.
+29. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
    Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
    Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-26. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+30. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
    Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
    Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-27. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+31. :doc:`FNet <model_doc/fnet>` (from Google Research) released with the paper `FNet: Mixing Tokens with Fourier
+    Transforms <https://arxiv.org/abs/2105.03824>`__ by James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago
+    Ontanon.
+32. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
    Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
    Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-28. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+33. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
    Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
    and Ilya Sutskever.
-29. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+34. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
    Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
    Luan, Dario Amodei** and Ilya Sutskever**.
-30. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
+35. :doc:`GPT-J <model_doc/gptj>` (from EleutherAI) released in the repository `kingoflolz/mesh-transformer-jax
+    <https://github.com/kingoflolz/mesh-transformer-jax/>`__ by Ben Wang and Aran Komatsuzaki.
+36. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
    <https://github.com/EleutherAI/gpt-neo>`__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-31. :doc:`Hubert <model_doc/hubert>` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech
+37. :doc:`Hubert <model_doc/hubert>` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech
    Representation Learning by Masked Prediction of Hidden Units <https://arxiv.org/abs/2106.07447>`__ by Wei-Ning Hsu,
    Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-32. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
-    <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
-33. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+38. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
+    <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer.
+39. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
    of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
    Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-34. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
+40. :doc:`LayoutLMv2 <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutLMv2:
+    Multi-modal Pre-training for Visually-Rich Document Understanding <https://arxiv.org/abs/2012.14740>`__ by Yang Xu,
+    Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Wanxiang Che, Min
+    Zhang, Lidong Zhou.
+41. :doc:`LayoutXLM <model_doc/layoutlmv2>` (from Microsoft Research Asia) released with the paper `LayoutXLM:
+    Multimodal Pre-training for Multilingual Visually-rich Document Understanding <https://arxiv.org/abs/2104.08836>`__
+    by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha Zhang, Furu Wei.
+42. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
    <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-35. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+43. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
    Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-36. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
+44. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
    Representations with Entity-aware Self-attention <https://arxiv.org/abs/2010.01057>`__ by Ikuya Yamada, Akari Asai,
    Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-37. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+45. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
    Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
    by Hao Tan and Mohit Bansal.
-38. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
-    Machine Translation <https://arxiv.org/abs/2010.11125>`__ by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi
-    Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman
-    Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-39. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+46. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
+    Machine Translation <https://arxiv.org/abs/2010.11125>`__ by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma,
+    Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal,
+    Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+47. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
    Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
    Translator Team.
-40. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+48. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
    Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
    Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-41. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
+49. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
    Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
    Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-42. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
+50. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-43. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
+51. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-44. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+52. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
    Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
    Jianfeng Lu, Tie-Yan Liu.
-45. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+53. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
    text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
    Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-46. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
-    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__> by Jingqing Zhang, Yao Zhao,
+54. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__ by Jingqing Zhang, Yao Zhao,
    Mohammad Saleh and Peter J. Liu.
-47. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+55. :doc:`PhoBERT <model_doc/phobert>` (from VinAI Research) released with the paper `PhoBERT: Pre-trained language
+    models for Vietnamese <https://www.aclweb.org/anthology/2020.findings-emnlp.92/>`__ by Dat Quoc Nguyen and Anh Tuan
+    Nguyen.
+56. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
    Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
    Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-48. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+57. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
    Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-49. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+58. :doc:`RemBERT <model_doc/rembert>` (from Google Research) released with the paper `Rethinking embedding coupling in
+    pre-trained language models <https://arxiv.org/pdf/2010.12821.pdf>`__ by Hyung Won Chung, Thibault Févry, Henry
+    Tsai, M. Johnson, Sebastian Ruder.
+59. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper `A Robustly Optimized BERT
    Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-50. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper a `RoFormer:
+60. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper `RoFormer:
    Enhanced Transformer with Rotary Position Embedding <https://arxiv.org/pdf/2104.09864v1.pdf>`__ by Jianlin Su and
    Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-51. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
+61. :doc:`SegFormer <model_doc/segformer>` (from NVIDIA) released with the paper `SegFormer: Simple and Efficient
+    Design for Semantic Segmentation with Transformers <https://arxiv.org/abs/2105.15203>`__ by Enze Xie, Wenhai Wang,
+    Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping Luo.
+62. :doc:`SEW <model_doc/sew>` (from ASAPP) released with the paper `Performance-Efficiency Trade-offs in Unsupervised
+    Pre-training for Speech Recognition <https://arxiv.org/abs/2109.06870>`__ by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu
+    Han, Kilian Q. Weinberger, Yoav Artzi.
+63. :doc:`SEW-D <model_doc/sew_d>` (from ASAPP) released with the paper `Performance-Efficiency Trade-offs in
+    Unsupervised Pre-training for Speech Recognition <https://arxiv.org/abs/2109.06870>`__ by Felix Wu, Kwangyoun Kim,
+    Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
+64. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
    `fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
    Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-52. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
-    about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
-    Krishna, and Kurt W. Keutzer.
-53. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+65. :doc:`SpeechToTextTransformer2 <model_doc/speech_to_text_2>` (from Facebook), released together with the paper
+    `Large-Scale Self- and Semi-Supervised Learning for Speech Translation <https://arxiv.org/abs/2104.06678>`__ by
+    Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+66. :doc:`Splinter <model_doc/splinter>` (from Tel Aviv University), released together with the paper `Few-Shot
+    Question Answering by Pretraining Span Selection <https://arxiv.org/abs/2101.00438>`__ by Ori Ram, Yuval Kirstain,
+    Jonathan Berant, Amir Globerson, Omer Levy.
+67. :doc:`SqueezeBert <model_doc/squeezebert>` (from Berkeley) released with the paper `SqueezeBERT: What can computer
+    vision teach NLP about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola,
+    Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+68. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
    Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
    Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-54. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
+69. :doc:`T5v1.1 <model_doc/t5v1.1>` (from Google AI) released in the repository
+    `google-research/text-to-text-transfer-transformer
+    <https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__ by
+    Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi
+    Zhou and Wei Li and Peter J. Liu.
+70. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
    Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
    Francesco Piccinno and Julian Martin Eisenschlos.
-55. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+71. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
    Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
    Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-56. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
+72. :doc:`TrOCR <model_doc/trocr>` (from Microsoft), released together with the paper `TrOCR: Transformer-based Optical
+    Character Recognition with Pre-trained Models <https://arxiv.org/abs/2109.10282>`__ by Minghao Li, Tengchao Lv, Lei
+    Cui, Yijuan Lu, Dinei Florencio, Cha Zhang, Zhoujun Li, Furu Wei.
+73. :doc:`UniSpeech <model_doc/unispeech>` (from Microsoft Research) released with the paper `UniSpeech: Unified Speech
+    Representation Learning with Labeled and Unlabeled Data <https://arxiv.org/abs/2101.07597>`__ by Chengyi Wang, Yu
+    Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael Zeng, Xuedong Huang.
+74. :doc:`UniSpeechSat <model_doc/unispeech_sat>` (from Microsoft Research) released with the paper `UniSpeech-SAT:
+    Universal Speech Representation Learning with Speaker Aware Pre-Training <https://arxiv.org/abs/2110.05752>`__ by
+    Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen, Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li,
+    Xiangzhan Yu.
+75. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
    Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
    Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
    Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-57. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
+76. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
    Performant Baseline for Vision and Language <https://arxiv.org/pdf/1908.03557>`__ by Liunian Harold Li, Mark
    Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-58. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
+77. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
    Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
    Zhou, Abdelrahman Mohamed, Michael Auli.
-59. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+78. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
    Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-60. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+79. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
    Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
    Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-61. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+80. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
    Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
    Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
    Zettlemoyer and Veselin Stoyanov.
-62. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+81. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
    Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
    Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-63. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
+82. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
    Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
    Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.

@@ -305,10 +365,12 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            Model            | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support |
 +=============================+================+================+=================+====================+==============+
-|           ALBERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|           ALBERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            BART             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            BEiT             |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            BERT             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Bert Generation       |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
@@ -319,71 +381,81 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         Blenderbot          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|       BlenderbotSmall       |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            CLIP             |       ✅       |       ✅       |       ✅        |         ❌         |      ✅      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+|       BlenderbotSmall       |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          CamemBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           Canine            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            CLIP             |       ✅       |       ✅       |       ✅        |         ❌         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          ConvBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            DETR             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             DPR             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|           DeBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           DeBERTa           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         DeBERTa-v2          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+|         DeBERTa-v2          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            DeiT             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|            DETR             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             DPR             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           ELECTRA           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|       Encoder decoder       |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|       Encoder decoder       |       ❌       |       ❌       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 | FairSeq Machine-Translation |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          FlauBERT           |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            FNet             |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |     Funnel Transformer      |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           GPT Neo           |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            GPT-J            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           Hubert            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           I-BERT            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         LayoutLMv2          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             LED             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         Longformer          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            LUKE             |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           LXMERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|         Longformer          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           M2M100            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            mBART            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |        MegatronBert         |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         MobileBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             mT5             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         OpenAI GPT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |        OpenAI GPT-2         |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           Pegasus           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|           Pegasus           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         ProphetNet          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -391,14 +463,28 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          Reformer           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           RemBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          RetriBERT          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           RoBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          RoFormer           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          SegFormer          |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             SEW             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            SEW-D            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|   Speech Encoder decoder    |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         Speech2Text         |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|        Speech2Text2         |       ✅       |       ❌       |       ❌        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          Splinter           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         SqueezeBERT         |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             T5              |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
@@ -407,10 +493,18 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |       Transformer-XL        |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             ViT             |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
+|            TrOCR            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          UniSpeech          |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|        UniSpeechSat         |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|   Vision Encoder decoder    |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         VisualBert          |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             ViT             |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          Wav2Vec2           |       ✅       |       ❌       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             XLM             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
@@ -421,10 +515,6 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            XLNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            mBART            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|             mT5             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
-+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+

 .. toctree::
    :maxdepth: 2
@@ -462,6 +552,7 @@ Flax), PyTorch, and/or TensorFlow.
    migration
    contributing
    add_new_model
+    add_new_pipeline
    fast_tokenizers
    performance
    parallelism
@@ -484,6 +575,7 @@ Flax), PyTorch, and/or TensorFlow.
    main_classes/callback
    main_classes/configuration
    main_classes/data_collator
+    main_classes/keras_callbacks
    main_classes/logging
    main_classes/model
    main_classes/optimizer_schedules
@@ -503,6 +595,8 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/auto
    model_doc/bart
    model_doc/barthez
+    model_doc/bartpho
+    model_doc/beit
    model_doc/bert
    model_doc/bertweet
    model_doc/bertgeneration
@@ -529,11 +623,14 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/electra
    model_doc/encoderdecoder
    model_doc/flaubert
+    model_doc/fnet
    model_doc/fsmt
    model_doc/funnel
    model_doc/herbert
    model_doc/ibert
    model_doc/layoutlm
+    model_doc/layoutlmv2
+    model_doc/layoutxlm
    model_doc/led
    model_doc/longformer
    model_doc/luke
@@ -548,6 +645,7 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/mt5
    model_doc/gpt
    model_doc/gpt2
+    model_doc/gptj
    model_doc/gpt_neo
    model_doc/hubert
    model_doc/pegasus
@@ -555,14 +653,26 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/prophetnet
    model_doc/rag
    model_doc/reformer
+    model_doc/rembert
    model_doc/retribert
    model_doc/roberta
    model_doc/roformer
+    model_doc/segformer
+    model_doc/sew
+    model_doc/sew_d
+    model_doc/speechencoderdecoder
    model_doc/speech_to_text
+    model_doc/speech_to_text_2
+    model_doc/splinter
    model_doc/squeezebert
    model_doc/t5
+    model_doc/t5v1.1
    model_doc/tapas
    model_doc/transformerxl
+    model_doc/trocr
+    model_doc/unispeech
+    model_doc/unispeech_sat
+    model_doc/visionencoderdecoder
    model_doc/vit
    model_doc/visual_bert
    model_doc/wav2vec2
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -79,9 +79,9 @@ Here is how to quickly install `transformers` from source:
 pip install git+https://github.com/huggingface/transformers
 ```

-Note that this will install not the latest released version, but the bleeding edge `master` version, which you may want to use in case a bug has been fixed since the last official release and a new release hasn't  been yet rolled out.
+Note that this will install not the latest released version, but the bleeding edge `master` version, which you may want to use in case a bug has been fixed since the last official release and a new release hasn't yet been rolled out.

-While we strive to keep `master` operational at all times, if you notice some issues, they usually get fixed within a few hours or a day and and you're more than welcome to help us detect any problems by opening an [Issue](https://github.com/huggingface/transformers/issues) and this way, things will get fixed even sooner.
+While we strive to keep `master` operational at all times, if you notice some issues, they usually get fixed within a few hours or a day and you're more than welcome to help us detect any problems by opening an [Issue](https://github.com/huggingface/transformers/issues) and this way, things will get fixed even sooner.

 Again, you can run:

--- a/docs/source/internal/modeling_utils.rst
+++ b/docs/source/internal/modeling_utils.rst
@@ -63,7 +63,6 @@ TensorFlow custom layers
    :members: call

 .. autoclass:: transformers.modeling_tf_utils.TFSequenceSummary
-    :members: call


 TensorFlow loss functions
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -17,6 +17,11 @@ The base class :class:`~transformers.PretrainedConfig` implements the common met
 either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
 from HuggingFace's AWS S3 repository).

+Each derived config class implements model specific attributes. Common attributes present in all config classes are:
+:obj:`hidden_size`, :obj:`num_attention_heads`, and :obj:`num_hidden_layers`. Text models further implement:
+:obj:`vocab_size`.
+
+

 PretrainedConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/main_classes/data_collator.rst
+++ b/docs/source/main_classes/data_collator.rst
@@ -18,7 +18,7 @@ the same type as the elements of :obj:`train_dataset` or :obj:`eval_dataset`.

 To be able to build batches, data collators may apply some processing (like padding). Some of them (like
 :class:`~transformers.DataCollatorForLanguageModeling`) also apply some random data augmentation (like random masking)
-oin the formed batch.
+on the formed batch.

 Examples of use can be found in the :doc:`example scripts <../examples>` or :doc:`example notebooks <../notebooks>`.

@@ -54,18 +54,18 @@ DataCollatorForLanguageModeling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.data.data_collator.DataCollatorForLanguageModeling
-    :members: mask_tokens
+    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens


 DataCollatorForWholeWordMask
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.data.data_collator.DataCollatorForWholeWordMask
-    :members: mask_tokens
+    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens


 DataCollatorForPermutationLanguageModeling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.data.data_collator.DataCollatorForPermutationLanguageModeling
-    :members: mask_tokens
+    :members: numpy_mask_tokens, tf_mask_tokens, torch_mask_tokens
--- a/docs/source/main_classes/deepspeed.rst
+++ b/docs/source/main_classes/deepspeed.rst
@@ -1728,7 +1728,7 @@ For example for a pretrained model:
 .. code-block:: python

    from transformers.deepspeed import HfDeepSpeedConfig
-    from transformers import AugoModel
+    from transformers import AutoModel, deepspeed

    ds_config = { ... } # deepspeed config object or path to the file
    # must run before instantiating the model
@@ -1741,7 +1741,7 @@ or for non-pretrained model:
 .. code-block:: python

    from transformers.deepspeed import HfDeepSpeedConfig
-    from transformers import AugoModel, AutoConfig
+    from transformers import AutoModel, AutoConfig, deepspeed

    ds_config = { ... } # deepspeed config object or path to the file
    # must run before instantiating the model
--- a/docs/source/main_classes/keras_callbacks.rst
+++ b/docs/source/main_classes/keras_callbacks.rst
@@ -0,0 +1,22 @@
+..
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Keras callbacks
+=======================================================================================================================
+
+When training a Transformers model with Keras, there are some library-specific callbacks available to automate common
+tasks:
+
+PushToHubCallback
+-----------------------------------------------------------------------------------------------------------------------
+
+.. autoclass:: transformers.keras_callbacks.PushToHubCallback
--- a/docs/source/main_classes/output.rst
+++ b/docs/source/main_classes/output.rst
@@ -210,6 +210,13 @@ TFBaseModelOutputWithPooling
    :members:


+TFBaseModelOutputWithPoolingAndCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutputWithPoolingAndCrossAttentions
+    :members:
+
+
 TFBaseModelOutputWithPast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -217,6 +224,13 @@ TFBaseModelOutputWithPast
    :members:


+TFBaseModelOutputWithPastAndCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFBaseModelOutputWithPastAndCrossAttentions
+    :members:
+
+
 TFSeq2SeqModelOutput
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -231,6 +245,13 @@ TFCausalLMOutput
    :members:


+TFCausalLMOutputWithCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFCausalLMOutputWithCrossAttentions
+    :members:
+
+
 TFCausalLMOutputWithPast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -299,3 +320,93 @@ TFSeq2SeqQuestionAnsweringModelOutput

 .. autoclass:: transformers.modeling_tf_outputs.TFSeq2SeqQuestionAnsweringModelOutput
    :members:
+
+
+FlaxBaseModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutput
+
+
+FlaxBaseModelOutputWithPast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPast
+
+
+FlaxBaseModelOutputWithPooling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPooling
+
+
+FlaxBaseModelOutputWithPastAndCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxBaseModelOutputWithPastAndCrossAttentions
+
+
+FlaxSeq2SeqModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqModelOutput
+
+
+FlaxCausalLMOutputWithCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxCausalLMOutputWithCrossAttentions
+
+
+FlaxMaskedLMOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxMaskedLMOutput
+
+
+FlaxSeq2SeqLMOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqLMOutput
+
+
+FlaxNextSentencePredictorOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxNextSentencePredictorOutput
+
+
+FlaxSequenceClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSequenceClassifierOutput
+
+
+FlaxSeq2SeqSequenceClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqSequenceClassifierOutput
+
+
+FlaxMultipleChoiceModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxMultipleChoiceModelOutput
+
+
+FlaxTokenClassifierOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxTokenClassifierOutput
+
+
+FlaxQuestionAnsweringModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxQuestionAnsweringModelOutput
+
+
+FlaxSeq2SeqQuestionAnsweringModelOutput
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_flax_outputs.FlaxSeq2SeqQuestionAnsweringModelOutput
--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -23,20 +23,23 @@ There are two categories of pipeline abstractions to be aware about:
 - The :func:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines.
 - The other task-specific pipelines:

+    - :class:`~transformers.AudioClassificationPipeline`
    - :class:`~transformers.AutomaticSpeechRecognitionPipeline`
    - :class:`~transformers.ConversationalPipeline`
    - :class:`~transformers.FeatureExtractionPipeline`
    - :class:`~transformers.FillMaskPipeline`
    - :class:`~transformers.ImageClassificationPipeline`
+    - :class:`~transformers.ImageSegmentationPipeline`
+    - :class:`~transformers.ObjectDetectionPipeline`
    - :class:`~transformers.QuestionAnsweringPipeline`
    - :class:`~transformers.SummarizationPipeline`
+    - :class:`~transformers.TableQuestionAnsweringPipeline`
    - :class:`~transformers.TextClassificationPipeline`
    - :class:`~transformers.TextGenerationPipeline`
+    - :class:`~transformers.Text2TextGenerationPipeline`
    - :class:`~transformers.TokenClassificationPipeline`
    - :class:`~transformers.TranslationPipeline`
    - :class:`~transformers.ZeroShotClassificationPipeline`
-    - :class:`~transformers.Text2TextGenerationPipeline`
-    - :class:`~transformers.TableQuestionAnsweringPipeline`

 The pipeline abstraction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -44,12 +47,60 @@ The pipeline abstraction
 The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any other
 pipeline but requires an additional argument which is the `task`.

+Simple call on one item:
+
+.. code-block::
+
+    >>> pipe = pipeline("text-classification")
+    >>> pipe("This restaurant is awesome")
+    [{'label': 'POSITIVE', 'score': 0.9998743534088135}]
+
+To call a pipeline on many items, you can call it with a `list`.
+
+.. code-block::
+
+    >>> pipe = pipeline("text-classification")
+    >>> pipe(["This restaurant is awesome", "This restaurant is aweful"])
+    [{'label': 'POSITIVE', 'score': 0.9998743534088135},
+     {'label': 'NEGATIVE', 'score': 0.9996669292449951}]
+
+
+To iterate over full datasets it is recommended to use a :obj:`dataset` directly. This means you don't need to
+allocate the whole dataset at once, nor do you need to do batching yourself. This should work just as fast as custom
+loops on GPU. If it doesn't, don't hesitate to create an issue.
+
+.. code-block::
+
+    pipe = pipeline("automatic-speech-recognition", model="facebook/wav2vec2-base-960h", device=0)
+    dataset = datasets.load_dataset("superb", name="asr", split="test")
+
+    # KeyDataset (only `pt`) will simply return the item in the dict returned by the dataset item
+    # as we're not interested in the `target` part of the dataset.
+    for out in tqdm.tqdm(pipe(KeyDataset(dataset, "file"))):
+        print(out)
+        # {"text": "NUMBER TEN FRESH NELLY IS WAITING ON YOU GOOD NIGHT HUSBAND"}
+        # {"text": ....}
+        # ....
+
+
 .. autofunction:: transformers.pipeline

+Implementing a pipeline
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+:doc:`Implementing a new pipeline <../add_new_pipeline>`

 The task specific pipelines
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+
+AudioClassificationPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.AudioClassificationPipeline
+    :special-members: __call__
+    :members:
+
 AutomaticSpeechRecognitionPipeline
 =======================================================================================================================

@@ -87,6 +138,13 @@ ImageClassificationPipeline
    :special-members: __call__
    :members:

+ImageSegmentationPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.ImageSegmentationPipeline
+    :special-members: __call__
+    :members:
+
 NerPipeline
 =======================================================================================================================

@@ -94,6 +152,13 @@ NerPipeline

 See :class:`~transformers.TokenClassificationPipeline` for all details.

+ObjectDetectionPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.ObjectDetectionPipeline
+    :special-members: __call__
+    :members:
+
 QuestionAnsweringPipeline
 =======================================================================================================================

--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -64,9 +64,9 @@ classification:

    class MultilabelTrainer(Trainer):
        def compute_loss(self, model, inputs, return_outputs=False):
-            labels = inputs.pop("labels")
+            labels = inputs.get("labels")
            outputs = model(**inputs)
-            logits = outputs.logits
+            logits = outputs.get('logits')
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                            labels.float().view(-1, self.model.config.num_labels))
@@ -119,6 +119,29 @@ TFTrainingArguments
    :members:


+Checkpoints
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+By default, :class:`~transformers.Trainer` will save all checkpoints in the :obj:`output_dir` you set in the
+:class:`~transformers.TrainingArguments` you are using. Those will go in subfolders named :obj:`checkpoint-xxx` with
+xxx being the step at which the training was.
+
+Resuming training from a checkpoint can be done when calling :meth:`~transformers.Trainer.train` with either:
+
+- :obj:`resume_from_checkpoint=True` which will resume training from the latest checkpoint
+- :obj:`resume_from_checkpoint=checkpoint_dir` which will resume training from the specific checkpoint in the directory
+  passed.
+
+In addition, you can easily save your checkpoints on the Model Hub when using :obj:`push_to_hub=True`. By default, all
+the models saved in intermediate checkpoints are saved in different commits, but not the optimizer state. You can adapt
+the :obj:`hub_strategy` value of your :class:`~transformers.TrainingArguments` to either:
+
+- :obj:`"checkpoint"`: the latest checkpoint is also pushed in a subfolder named last-checkpoint, allowing you to
+  resume training easily with :obj:`trainer.train(resume_from_checkpoint="output_dir/last-checkpoint")`.
+- :obj:`"all_checkpoints"`: all checkpoints are pushed like they appear in the output folder (so you will get one
+  checkpoint folder per folder in your final repository)
+
+
 Logging
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -197,7 +220,7 @@ which should make the "stop and resume" style of training as close as possible t
 However, due to various default non-deterministic pytorch settings this might not fully work. If you want full
 determinism please refer to `Controlling sources of randomness
 <https://pytorch.org/docs/stable/notes/randomness.html>`__. As explained in the document, that some of those settings
-that make things determinstic (.e.g., ``torch.backends.cudnn.deterministic``) may slow things down, therefore this
+that make things deterministic (e.g., ``torch.backends.cudnn.deterministic``) may slow things down, therefore this
 can't be done by default, but you can enable those yourself if needed.


--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -43,7 +43,8 @@ Tips:
  similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
  number of (repeating) layers.

-This model was contributed by `lysandre <https://huggingface.co/lysandre>`__. The original code can be found `here
+This model was contributed by `lysandre <https://huggingface.co/lysandre>`__. The JAX version of this model was
+contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
 <https://github.com/google-research/ALBERT>`__.

 AlbertConfig
@@ -174,3 +175,52 @@ TFAlbertForQuestionAnswering

 .. autoclass:: transformers.TFAlbertForQuestionAnswering
    :members: call
+
+
+FlaxAlbertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertModel
+    :members: __call__
+
+
+FlaxAlbertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForPreTraining
+    :members: __call__
+
+
+FlaxAlbertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForMaskedLM
+    :members: __call__
+
+
+FlaxAlbertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForSequenceClassification
+    :members: __call__
+
+
+FlaxAlbertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForMultipleChoice
+    :members: __call__
+
+
+FlaxAlbertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForTokenClassification
+    :members: __call__
+
+
+FlaxAlbertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAlbertForQuestionAnswering
+    :members: __call__
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -27,7 +27,32 @@ Instantiating one of :class:`~transformers.AutoConfig`, :class:`~transformers.Au

 will create a model that is an instance of :class:`~transformers.BertModel`.

-There is one class of :obj:`AutoModel` for each task, and for each backend (PyTorch or TensorFlow).
+There is one class of :obj:`AutoModel` for each task, and for each backend (PyTorch, TensorFlow, or Flax).
+
+Extending the Auto Classes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Each of the auto classes has a method to be extended with your custom classes. For instance, if you have defined a
+custom class of model :obj:`NewModel`, make sure you have a :obj:`NewModelConfig`, then you can add those to the auto
+classes like this:
+
+.. code-block::
+
+    from transformers import AutoConfig, AutoModel
+
+    AutoConfig.register("new-model", NewModelConfig)
+    AutoModel.register(NewModelConfig, NewModel)
+
+You will then be able to use the auto classes like you would usually do!
+
+.. warning::
+
+    If your :obj:`NewModelConfig` is a subclass of :class:`~transformers.PretrainedConfig`, make sure its
+    :obj:`model_type` attribute is set to the same key you use when registering the config (here :obj:`"new-model"`).
+
+    Likewise, if your :obj:`NewModel` is a subclass of :class:`~transformers.PreTrainedModel`, make sure its
+    :obj:`config_class` attribute is set to the same class you use when registering the model (here
+    :obj:`NewModelConfig`).


 AutoConfig
@@ -135,6 +160,41 @@ AutoModelForImageClassification
    :members:


+AutoModelForAudioClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForAudioClassification
+    :members:
+
+
+AutoModelForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForCTC
+    :members:
+
+
+AutoModelForSpeechSeq2Seq
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForSpeechSeq2Seq
+    :members:
+
+
+AutoModelForObjectDetection
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForObjectDetection
+    :members:
+
+
+AutoModelForImageSegmentation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForImageSegmentation
+    :members:
+
+
 TFAutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/bartpho.rst
+++ b/docs/source/model_doc/bartpho.rst
@@ -0,0 +1,86 @@
+..
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BARTpho
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BARTpho model was proposed in `BARTpho: Pre-trained Sequence-to-Sequence Models for Vietnamese
+<https://arxiv.org/abs/2109.09701>`__ by Nguyen Luong Tran, Duong Minh Le and Dat Quoc Nguyen.
+
+The abstract from the paper is the following:
+
+*We present BARTpho with two versions -- BARTpho_word and BARTpho_syllable -- the first public large-scale monolingual
+sequence-to-sequence models pre-trained for Vietnamese. Our BARTpho uses the "large" architecture and pre-training
+scheme of the sequence-to-sequence denoising model BART, thus especially suitable for generative NLP tasks. Experiments
+on a downstream task of Vietnamese text summarization show that in both automatic and human evaluations, our BARTpho
+outperforms the strong baseline mBART and improves the state-of-the-art. We release BARTpho to facilitate future
+research and applications of generative Vietnamese NLP tasks.*
+
+Example of use:
+
+.. code-block::
+
+    >>> import torch
+    >>> from transformers import AutoModel, AutoTokenizer
+
+    >>> bartpho = AutoModel.from_pretrained("vinai/bartpho-syllable")
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-syllable")
+
+    >>> line = "Chúng tôi là những nghiên cứu viên."
+
+    >>> input_ids = tokenizer(line, return_tensors="pt")
+
+    >>> with torch.no_grad():
+    ...     features = bartpho(**input_ids)  # Models outputs are now tuples
+
+    >>> # With TensorFlow 2.0+:
+    >>> from transformers import TFAutoModel
+    >>> bartpho = TFAutoModel.from_pretrained("vinai/bartpho-syllable")
+    >>> input_ids = tokenizer(line, return_tensors="tf")
+    >>> features = bartpho(**input_ids)
+
+Tips:
+
+- Following mBART, BARTpho uses the "large" architecture of BART with an additional layer-normalization layer on top of
+  both the encoder and decoder. Thus, usage examples in the :doc:`documentation of BART <bart>`, when adapting to use
+  with BARTpho, should be adjusted by replacing the BART-specialized classes with the mBART-specialized counterparts.
+  For example:
+
+.. code-block::
+
+    >>> from transformers import MBartForConditionalGeneration
+    >>> bartpho = MBartForConditionalGeneration.from_pretrained("vinai/bartpho-syllable")
+    >>> TXT = 'Chúng tôi là <mask> nghiên cứu viên.'
+    >>> input_ids = tokenizer([TXT], return_tensors='pt')['input_ids']
+    >>> logits = bartpho(input_ids).logits
+    >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
+    >>> probs = logits[0, masked_index].softmax(dim=0)
+    >>> values, predictions = probs.topk(5)
+    >>> print(tokenizer.decode(predictions).split())
+
+- This implementation is only for tokenization: "monolingual_vocab_file" consists of Vietnamese-specialized types
+  extracted from the pre-trained SentencePiece model "vocab_file" that is available from the multilingual XLM-RoBERTa.
+  Other languages, if employing this pre-trained multilingual SentencePiece model "vocab_file" for subword
+  segmentation, can reuse BartphoTokenizer with their own language-specialized "monolingual_vocab_file".
+
+This model was contributed by `dqnguyen <https://huggingface.co/dqnguyen>`__. The original code can be found `here
+<https://github.com/VinAIResearch/BARTpho>`__.
+
+BartphoTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartphoTokenizer
+    :members:
--- a/docs/source/model_doc/beit.rst
+++ b/docs/source/model_doc/beit.rst
@@ -0,0 +1,119 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BEiT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BEiT model was proposed in `BEiT: BERT Pre-Training of Image Transformers <https://arxiv.org/abs/2106.08254>`__ by
+Hangbo Bao, Li Dong and Furu Wei. Inspired by BERT, BEiT is the first paper that makes self-supervised pre-training of
+Vision Transformers (ViTs) outperform supervised pre-training. Rather than pre-training the model to predict the class
+of an image (as done in the `original ViT paper <https://arxiv.org/abs/2010.11929>`__), BEiT models are pre-trained to
+predict visual tokens from the codebook of OpenAI's `DALL-E model <https://arxiv.org/abs/2102.12092>`__ given masked
+patches.
+
+The abstract from the paper is the following:
+
+*We introduce a self-supervised vision representation model BEiT, which stands for Bidirectional Encoder representation
+from Image Transformers. Following BERT developed in the natural language processing area, we propose a masked image
+modeling task to pretrain vision Transformers. Specifically, each image has two views in our pre-training, i.e, image
+patches (such as 16x16 pixels), and visual tokens (i.e., discrete tokens). We first "tokenize" the original image into
+visual tokens. Then we randomly mask some image patches and fed them into the backbone Transformer. The pre-training
+objective is to recover the original visual tokens based on the corrupted image patches. After pre-training BEiT, we
+directly fine-tune the model parameters on downstream tasks by appending task layers upon the pretrained encoder.
+Experimental results on image classification and semantic segmentation show that our model achieves competitive results
+with previous pre-training methods. For example, base-size BEiT achieves 83.2% top-1 accuracy on ImageNet-1K,
+significantly outperforming from-scratch DeiT training (81.8%) with the same setup. Moreover, large-size BEiT obtains
+86.3% only using ImageNet-1K, even outperforming ViT-L with supervised pre-training on ImageNet-22K (85.2%).*
+
+Tips:
+
+- BEiT models are regular Vision Transformers, but pre-trained in a self-supervised way rather than supervised. They
+  outperform both the original model (ViT) as well as Data-efficient Image Transformers (DeiT) when fine-tuned on
+  ImageNet-1K and CIFAR-100.
+- As the BEiT models expect each image to be of the same size (resolution), one can use
+  :class:`~transformers.BeitFeatureExtractor` to resize (or rescale) and normalize images for the model.
+- Both the patch resolution and image resolution used during pre-training or fine-tuning are reflected in the name of
+  each checkpoint. For example, :obj:`microsoft/beit-base-patch16-224` refers to a base-sized architecture with patch
+  resolution of 16x16 and fine-tuning resolution of 224x224. All checkpoints can be found on the `hub
+  <https://huggingface.co/models?search=microsoft/beit>`__.
+- The available checkpoints are either (1) pre-trained on `ImageNet-22k <http://www.image-net.org/>`__ (a collection of
+  14 million images and 22k classes) only, (2) also fine-tuned on ImageNet-22k or (3) also fine-tuned on `ImageNet-1k
+  <http://www.image-net.org/challenges/LSVRC/2012/>`__ (also referred to as ILSVRC 2012, a collection of 1.3 million
+  images and 1,000 classes).
+- BEiT uses relative position embeddings, inspired by the T5 model. During pre-training, the authors shared the
+  relative position bias among the several self-attention layers. During fine-tuning, each layer's relative position
+  bias is initialized with the shared relative position bias obtained after pre-training. Note that, if one wants to
+  pre-train a model from scratch, one needs to set either the :obj:`use_relative_position_bias` or the
+  :obj:`use_shared_relative_position_bias` attribute of :class:`~transformers.BeitConfig` to :obj:`True` in order to
+  add position embeddings.
+
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The JAX/FLAX version of this model was
+contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
+<https://github.com/microsoft/unilm/tree/master/beit>`__.
+
+BeitConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitConfig
+    :members:
+
+
+BeitFeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitFeatureExtractor
+    :members: __call__
+
+
+BeitModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitModel
+    :members: forward
+
+
+BeitForMaskedImageModeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitForMaskedImageModeling
+    :members: forward
+
+
+BeitForImageClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeitForImageClassification
+    :members: forward
+
+
+FlaxBeitModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBeitModel
+    :members: __call__
+
+
+FlaxBeitForMaskedImageModeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBeitForMaskedImageModeling
+    :members: __call__
+
+
+FlaxBeitForImageClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBeitForImageClassification
+    :members: __call__
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -76,6 +76,9 @@ Bert specific outputs
 .. autoclass:: transformers.models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
    :members:

+.. autoclass:: transformers.models.bert.modeling_flax_bert.FlaxBertForPreTrainingOutput
+    :members:
+

 BertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/bertweet.rst
+++ b/docs/source/model_doc/bertweet.rst
@@ -10,7 +10,7 @@
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

-Bertweet
+BERTweet
 -----------------------------------------------------------------------------------------------------------------------

 Overview
--- a/docs/source/model_doc/blenderbot_small.rst
+++ b/docs/source/model_doc/blenderbot_small.rst
@@ -57,6 +57,13 @@ BlenderbotSmallTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+BlenderbotSmallTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallTokenizerFast
+    :members:
+
+
 BlenderbotSmallModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/byt5.rst
+++ b/docs/source/model_doc/byt5.rst
@@ -39,8 +39,11 @@ experiments.*
 This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
 found `here <https://github.com/google-research/byt5>`__.

+ByT5's architecture is based on the T5v1.1 model, so one can refer to :doc:`T5v1.1's documentation page <t5v1.1>`. They
+only differ in how inputs should be prepared for the model, see the code examples below.

-ByT5's architecture is based on the T5 model, so one can refer to :doc:`T5's documentation page <t5>`.
+Since ByT5 was pre-trained in an unsupervised fashion, there's no real advantage to using a task prefix during
+fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.


 Example
--- a/docs/source/model_doc/deberta.rst
+++ b/docs/source/model_doc/deberta.rst
@@ -38,7 +38,8 @@ the training data performs consistently better on a wide range of NLP tasks, ach
 pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*


-This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. The original code can be found `here
+This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. The TF 2.0 implementation of this model
+was contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
 <https://github.com/microsoft/DeBERTa>`__.


@@ -103,3 +104,45 @@ DebertaForQuestionAnswering

 .. autoclass:: transformers.DebertaForQuestionAnswering
    :members: forward
+
+
+TFDebertaModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaModel
+    :members: call
+
+
+TFDebertaPreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaPreTrainedModel
+    :members: call
+
+
+TFDebertaForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaForMaskedLM
+    :members: call
+
+
+TFDebertaForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaForSequenceClassification
+    :members: call
+
+
+TFDebertaForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaForTokenClassification
+    :members: call
+
+
+TFDebertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/deberta_v2.rst
+++ b/docs/source/model_doc/deberta_v2.rst
@@ -53,12 +53,13 @@ New in v2:
  transformer layer to better learn the local dependency of input tokens.
 - **Sharing position projection matrix with content projection matrix in attention layer** Based on previous
  experiments, this can save parameters without affecting the performance.
- **Apply bucket to encode relative postions** The DeBERTa-v2 model uses log bucket to encode relative positions
+- **Apply bucket to encode relative positions** The DeBERTa-v2 model uses log bucket to encode relative positions
  similar to T5.
 - **900M model & 1.5B model** Two additional model sizes are available: 900M and 1.5B, which significantly improves the
  performance of downstream tasks.

-This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. The original code can be found `here
+This model was contributed by `DeBERTa <https://huggingface.co/DeBERTa>`__. The TF 2.0 implementation of this model
+was contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found `here
 <https://github.com/microsoft/DeBERTa>`__.


@@ -117,3 +118,45 @@ DebertaV2ForQuestionAnswering

 .. autoclass:: transformers.DebertaV2ForQuestionAnswering
    :members: forward
+
+
+TFDebertaV2Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2Model
+    :members: call
+
+
+TFDebertaV2PreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2PreTrainedModel
+    :members: call
+
+
+TFDebertaV2ForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2ForMaskedLM
+    :members: call
+
+
+TFDebertaV2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2ForSequenceClassification
+    :members: call
+
+
+TFDebertaV2ForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2ForTokenClassification
+    :members: call
+
+
+TFDebertaV2ForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDebertaV2ForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -44,8 +44,9 @@ Tips:
 - DistilBERT doesn't have options to select the input positions (:obj:`position_ids` input). This could be added if
  necessary though, just let us know if you need this option.

-This model was contributed by `victorsanh <https://huggingface.co/victorsanh>`__. The original code can be found
-:prefix_link:`here <examples/research-projects/distillation>`.
+This model was contributed by `victorsanh <https://huggingface.co/victorsanh>`__. The JAX version of this model was
+contributed by `kamalkraj <https://huggingface.co/kamalkraj>`__. The original code can be found :prefix_link:`here
+<examples/research_projects/distillation>`.


 DistilBertConfig
@@ -152,3 +153,45 @@ TFDistilBertForQuestionAnswering

 .. autoclass:: transformers.TFDistilBertForQuestionAnswering
    :members: call
+
+
+FlaxDistilBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertModel
+    :members: __call__
+
+
+FlaxDistilBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForMaskedLM
+    :members: __call__
+
+
+FlaxDistilBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForSequenceClassification
+    :members: __call__
+
+
+FlaxDistilBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForMultipleChoice
+    :members: __call__
+
+
+FlaxDistilBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForTokenClassification
+    :members: __call__
+
+
+FlaxDistilBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxDistilBertForQuestionAnswering
+    :members: __call__
--- a/docs/source/model_doc/dpr.rst
+++ b/docs/source/model_doc/dpr.rst
@@ -41,6 +41,13 @@ DPRConfig
    :members:


+DPRPreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DPRPreTrainedModel
+    :members:
+
+
 DPRContextEncoderTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/encoderdecoder.rst
+++ b/docs/source/model_doc/encoderdecoder.rst
@@ -27,6 +27,25 @@ An application of this architecture could be to leverage two pretrained :class:`
 and decoder for a summarization model as was shown in: `Text Summarization with Pretrained Encoders
 <https://arxiv.org/abs/1908.08345>`__ by Yang Liu and Mirella Lapata.

+:meth:`~transformers.TFEncoderDecoderModel.from_pretrained` currently doesn't support initializing the model from a
+PyTorch checkpoint. Passing ``from_pt=True`` to this method will throw an exception. If there are only PyTorch
+checkpoints for a particular encoder-decoder model, a workaround is:
+
+.. code-block::
+
+    >>> # a workaround to load from pytorch checkpoint
+    >>> _model = EncoderDecoderModel.from_pretrained("patrickvonplaten/bert2bert-cnn_dailymail-fp16")
+    >>> _model.encoder.save_pretrained("./encoder")
+    >>> _model.decoder.save_pretrained("./decoder")
+    >>> model = TFEncoderDecoderModel.from_encoder_decoder_pretrained(
+    ...     "./encoder", "./decoder", encoder_from_pt=True, decoder_from_pt=True
+    ... )
+    >>> # This is only for copying some specific attributes of this particular model.
+    >>> model.config = _model.config
+
+This model was contributed by `thomwolf <https://github.com/thomwolf>`__. This model's TensorFlow and Flax versions
+were contributed by `ydshieh <https://github.com/ydshieh>`__.
+

 EncoderDecoderConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -40,3 +59,17 @@ EncoderDecoderModel

 .. autoclass:: transformers.EncoderDecoderModel
    :members: forward, from_encoder_decoder_pretrained
+
+
+TFEncoderDecoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFEncoderDecoderModel
+    :members: call, from_encoder_decoder_pretrained
+
+
+FlaxEncoderDecoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxEncoderDecoderModel
+    :members: __call__, from_encoder_decoder_pretrained
--- a/docs/source/model_doc/fnet.rst
+++ b/docs/source/model_doc/fnet.rst
@@ -0,0 +1,121 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+FNet
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The FNet model was proposed in `FNet: Mixing Tokens with Fourier Transforms <https://arxiv.org/abs/2105.03824>`__ by
+James Lee-Thorp, Joshua Ainslie, Ilya Eckstein, Santiago Ontanon. The model replaces the self-attention layer in a BERT
+model with a Fourier transform which returns only the real parts of the transform. The model is significantly faster
+than the BERT model because it has fewer parameters and is more memory efficient. The model achieves about 92-97%
+accuracy of BERT counterparts on GLUE benchmark, and trains much faster than the BERT model. The abstract from the
+paper is the following:
+
+*We show that Transformer encoder architectures can be sped up, with limited accuracy costs, by replacing the
+self-attention sublayers with simple linear transformations that "mix" input tokens. These linear mixers, along with
+standard nonlinearities in feed-forward layers, prove competent at modeling semantic relationships in several text
+classification tasks. Most surprisingly, we find that replacing the self-attention sublayer in a Transformer encoder
+with a standard, unparameterized Fourier Transform achieves 92-97% of the accuracy of BERT counterparts on the GLUE
+benchmark, but trains 80% faster on GPUs and 70% faster on TPUs at standard 512 input lengths. At longer input lengths,
+our FNet model is significantly faster: when compared to the "efficient" Transformers on the Long Range Arena
+benchmark, FNet matches the accuracy of the most accurate models, while outpacing the fastest models across all
+sequence lengths on GPUs (and across relatively shorter lengths on TPUs). Finally, FNet has a light memory footprint
+and is particularly efficient at smaller model sizes; for a fixed speed and accuracy budget, small FNet models
+outperform Transformer counterparts.*
+
+Tips on usage:
+
+- The model was trained without an attention mask as it is based on Fourier Transform. The model was trained with
+  maximum sequence length 512 which includes pad tokens. Hence, it is highly recommended to use the same maximum
+  sequence length for fine-tuning and inference.
+
+This model was contributed by `gchhablani <https://huggingface.co/gchhablani>`__. The original code can be found `here
+<https://github.com/google-research/google-research/tree/master/f_net>`__.
+
+FNetConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetConfig
+    :members:
+
+
+FNetTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+FNetTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetTokenizerFast
+    :members:
+
+
+FNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetModel
+    :members: forward
+
+
+FNetForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForPreTraining
+    :members: forward
+
+
+FNetForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForMaskedLM
+    :members: forward
+
+
+FNetForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForNextSentencePrediction
+    :members: forward
+
+FNetForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForSequenceClassification
+    :members: forward
+
+
+FNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForMultipleChoice
+    :members: forward
+
+
+FNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForTokenClassification
+    :members: forward
+
+
+FNetForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FNetForQuestionAnswering
+    :members: forward
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -36,10 +36,13 @@ Tips:
 - GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
  token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as it can be
  observed in the `run_generation.py` example script.
- The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
-  this `past` value prevents the model from re-computing pre-computed values in the context of text generation. See
-  `reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage of
-  this argument.
+- The model can take the `past_key_values` (for PyTorch) or `past` (for TF) as input, which is the previously computed
+  key/value attention pairs. Using this (`past_key_values` or `past`) value prevents the model from re-computing
+  pre-computed values in the context of text generation. For PyTorch, see the `past_key_values` argument of the
+  :meth:`~transformers.GPT2Model.forward` method, or for TF the `past` argument of the
+  :meth:`~transformers.TFGPT2Model.call` method for more information on its usage.
+- Enabling the `scale_attn_by_inverse_layer_idx` and `reorder_and_upcast_attn` flags will apply the training stability
+  improvements from `Mistral <https://github.com/stanford-crfm/mistral/>`__ (for PyTorch only).

 `Write With Transformer <https://transformer.huggingface.co/doc/gpt2-large>`__ is a webapp created and hosted by
 Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
@@ -108,6 +111,13 @@ GPT2ForSequenceClassification
    :members: forward


+GPT2ForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPT2ForTokenClassification
+    :members: forward
+
+
 TFGPT2Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/gptj.rst
+++ b/docs/source/model_doc/gptj.rst
@@ -0,0 +1,121 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+GPT-J
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The GPT-J model was released in the `kingoflolz/mesh-transformer-jax
+<https://github.com/kingoflolz/mesh-transformer-jax>`__ repository by Ben Wang and Aran Komatsuzaki. It is a GPT-2-like
+causal language model trained on `the Pile <https://pile.eleuther.ai/>`__ dataset.
+
+This model was contributed by `Stella Biderman <https://huggingface.co/stellaathena>`__.
+
+Tips:
+
+- To load `GPT-J <https://huggingface.co/EleutherAI/gpt-j-6B>`__ in float32 one would need at least 2x model size CPU
+  RAM: 1x for initial weights and another 1x to load the checkpoint. So for GPT-J it would take at least 48GB of CPU
+  RAM to just load the model. To reduce the CPU RAM usage there are a few options. The ``torch_dtype`` argument can be
+  used to initialize the model in half-precision. And the ``low_cpu_mem_usage`` argument can be used to keep the RAM
+  usage to 1x. There is also a `fp16 branch <https://huggingface.co/EleutherAI/gpt-j-6B/tree/float16>`__ which stores
+  the fp16 weights, which could be used to further minimize the RAM usage. Combining all this it should take roughly
+  12.1GB of CPU RAM to load the model.
+
+.. code-block::
+
+    >>> from transformers import GPTJForCausalLM
+    >>> import torch
+
+    >>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", revision="float16", torch_dtype=torch.float16, low_cpu_mem_usage=True)
+
+
+- The model should fit on a 16GB GPU for inference. For training/fine-tuning it would take much more GPU RAM. Adam
+  optimizer for example makes four copies of the model: model, gradients, average and squared average of the gradients.
+  So it would need at least 4x model size GPU memory, even with mixed precision as gradient updates are in fp32. This
+  is not including the activations and data batches, which would again require some more GPU RAM. So one should explore
+  solutions such as DeepSpeed, to train/fine-tune the model. Another option is to use the original codebase to
+  train/fine-tune the model on TPU and then convert the model to Transformers format for inference. Instructions for
+  that could be found `here <https://github.com/kingoflolz/mesh-transformer-jax/blob/master/howto_finetune.md>`__.
+
+- Although the embedding matrix has a size of 50400, only 50257 entries are used by the GPT-2 tokenizer. These extra
+  tokens are added for the sake of efficiency on TPUs. To avoid the mismatch between embedding matrix size and vocab
+  size, the tokenizer for `GPT-J <https://huggingface.co/EleutherAI/gpt-j-6B>`__ contains 143 extra tokens
+  ``<|extratoken_1|>... <|extratoken_143|>``, so the ``vocab_size`` of tokenizer also becomes 50400.
+
+Generation
+_______________________________________________________________________________________________________________________
+
+The :meth:`~transformers.generation_utils.GenerationMixin.generate` method can be used to generate text using the
+GPT-J model.
+
+.. code-block::
+
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+    >>> model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-j-6B")
+    >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
+
+    >>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
+    ...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
+    ...          "researchers was the fact that the unicorns spoke perfect English."
+
+    >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+
+    >>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+    >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
+
+...or in float16 precision:
+
+.. code-block::
+
+    >>> from transformers import GPTJForCausalLM, AutoTokenizer
+    >>> import torch
+
+    >>> model = GPTJForCausalLM.from_pretrained("EleutherAI/gpt-j-6B", torch_dtype=torch.float16)
+    >>> tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")
+
+    >>> prompt = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
+    ...          "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
+    ...          "researchers was the fact that the unicorns spoke perfect English."
+
+    >>> input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+
+    >>> gen_tokens = model.generate(input_ids, do_sample=True, temperature=0.9, max_length=100,)
+    >>> gen_text = tokenizer.batch_decode(gen_tokens)[0]
+
+
+GPTJConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTJConfig
+    :members:
+
+GPTJModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTJModel
+    :members: forward
+
+
+GPTJForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTJForCausalLM
+    :members: forward
+
+
+GPTJForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPTJForSequenceClassification
+    :members: forward
--- a/docs/source/model_doc/herbert.rst
+++ b/docs/source/model_doc/herbert.rst
@@ -10,13 +10,13 @@
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

-herBERT
+HerBERT
 -----------------------------------------------------------------------------------------------------------------------

 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The herBERT model was proposed in `KLEJ: Comprehensive Benchmark for Polish Language Understanding
+The HerBERT model was proposed in `KLEJ: Comprehensive Benchmark for Polish Language Understanding
 <https://www.aclweb.org/anthology/2020.acl-main.111.pdf>`__ by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, and
 Ireneusz Gawlik. It is a BERT-based Language Model trained on Polish Corpora using only MLM objective with dynamic
 masking of whole words.
--- a/docs/source/model_doc/hubert.rst
+++ b/docs/source/model_doc/hubert.rst
@@ -64,6 +64,14 @@ HubertForCTC
 .. autoclass:: transformers.HubertForCTC
    :members: forward

+
+HubertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.HubertForSequenceClassification
+    :members: forward
+
+
 TFHubertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/layoutlmv2.rst
+++ b/docs/source/model_doc/layoutlmv2.rst
@@ -0,0 +1,314 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+LayoutLMV2
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The LayoutLMV2 model was proposed in `LayoutLMv2: Multi-modal Pre-training for Visually-Rich Document Understanding
+<https://arxiv.org/abs/2012.14740>`__ by Yang Xu, Yiheng Xu, Tengchao Lv, Lei Cui, Furu Wei, Guoxin Wang, Yijuan Lu,
+Dinei Florencio, Cha Zhang, Wanxiang Che, Min Zhang, Lidong Zhou. LayoutLMV2 improves `LayoutLM
+<https://huggingface.co/transformers/model_doc/layoutlm.html>`__ to obtain state-of-the-art results across several
+document image understanding benchmarks:
+
+- information extraction from scanned documents: the `FUNSD <https://guillaumejaume.github.io/FUNSD/>`__ dataset (a
+  collection of 199 annotated forms comprising more than 30,000 words), the `CORD <https://github.com/clovaai/cord>`__
+  dataset (a collection of 800 receipts for training, 100 for validation and 100 for testing), the `SROIE
+  <https://rrc.cvc.uab.es/?ch=13>`__ dataset (a collection of 626 receipts for training and 347 receipts for testing)
+  and the `Kleister-NDA <https://github.com/applicaai/kleister-nda>`__ dataset (a collection of non-disclosure
+  agreements from the EDGAR database, including 254 documents for training, 83 documents for validation, and 203
+  documents for testing).
+- document image classification: the `RVL-CDIP <https://www.cs.cmu.edu/~aharley/rvl-cdip/>`__ dataset (a collection of
+  400,000 images belonging to one of 16 classes).
+- document visual question answering: the `DocVQA <https://arxiv.org/abs/2007.00398>`__ dataset (a collection of 50,000
+  questions defined on 12,000+ document images).
+
+The abstract from the paper is the following:
+
+*Pre-training of text and layout has proved effective in a variety of visually-rich document understanding tasks due to
+its effective model architecture and the advantage of large-scale unlabeled scanned/digital-born documents. In this
+paper, we present LayoutLMv2 by pre-training text, layout and image in a multi-modal framework, where new model
+architectures and pre-training tasks are leveraged. Specifically, LayoutLMv2 not only uses the existing masked
+visual-language modeling task but also the new text-image alignment and text-image matching tasks in the pre-training
+stage, where cross-modality interaction is better learned. Meanwhile, it also integrates a spatial-aware self-attention
+mechanism into the Transformer architecture, so that the model can fully understand the relative positional
+relationship among different text blocks. Experiment results show that LayoutLMv2 outperforms strong baselines and
+achieves new state-of-the-art results on a wide variety of downstream visually-rich document understanding tasks,
+including FUNSD (0.7895 -> 0.8420), CORD (0.9493 -> 0.9601), SROIE (0.9524 -> 0.9781), Kleister-NDA (0.834 -> 0.852),
+RVL-CDIP (0.9443 -> 0.9564), and DocVQA (0.7295 -> 0.8672). The pre-trained LayoutLMv2 model is publicly available at
+this https URL.*
+
+Tips:
+
+- The main difference between LayoutLMv1 and LayoutLMv2 is that the latter incorporates visual embeddings during
+  pre-training (while LayoutLMv1 only adds visual embeddings during fine-tuning).
+- LayoutLMv2 adds both a relative 1D attention bias as well as a spatial 2D attention bias to the attention scores in
+  the self-attention layers. Details can be found on page 5 of the `paper <https://arxiv.org/abs/2012.14740>`__.
+- Demo notebooks on how to use the LayoutLMv2 model on RVL-CDIP, FUNSD, DocVQA, CORD can be found `here
+  <https://github.com/NielsRogge/Transformers-Tutorials>`__.
+- LayoutLMv2 uses Facebook AI's `Detectron2 <https://github.com/facebookresearch/detectron2/>`__ package for its visual
+  backbone. See `this link <https://detectron2.readthedocs.io/en/latest/tutorials/install.html>`__ for installation
+  instructions.
+- In addition to :obj:`input_ids`, :meth:`~transformers.LayoutLMv2Model.forward` expects 2 additional inputs, namely
+  :obj:`image` and :obj:`bbox`. The :obj:`image` input corresponds to the original document image in which the text
+  tokens occur. The model expects each document image to be of size 224x224. This means that if you have a batch of
+  document images, :obj:`image` should be a tensor of shape (batch_size, 3, 224, 224). This can be either a
+  :obj:`torch.Tensor` or a :obj:`detectron2.structures.ImageList`. You don't need to normalize the channels, as this is
+  done by the model. Important to note is that the visual backbone expects BGR channels instead of RGB, as all models
+  in Detectron2 are pre-trained using the BGR format. The :obj:`bbox` input are the bounding boxes (i.e. 2D-positions)
+  of the input text tokens. This is identical to :class:`~transformers.LayoutLMModel`. These can be obtained using an
+  external OCR engine such as Google's `Tesseract <https://github.com/tesseract-ocr/tesseract>`__ (there's a `Python
+  wrapper <https://pypi.org/project/pytesseract/>`__ available). Each bounding box should be in (x0, y0, x1, y1)
+  format, where (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1, y1)
+  represents the position of the lower right corner. Note that one first needs to normalize the bounding boxes to be on
+  a 0-1000 scale. To normalize, you can use the following function:
+
+.. code-block::
+
+    def normalize_bbox(bbox, width, height):
+         return [
+             int(1000 * (bbox[0] / width)),
+             int(1000 * (bbox[1] / height)),
+             int(1000 * (bbox[2] / width)),
+             int(1000 * (bbox[3] / height)),
+         ]
+
+Here, :obj:`width` and :obj:`height` correspond to the width and height of the original document in which the token
+occurs (before resizing the image). Those can be obtained using the Python Imaging Library (PIL), for example, as
+follows:
+
+.. code-block::
+
+    from PIL import Image
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.")
+
+    width, height = image.size
+
+However, this model includes a brand new :class:`~transformers.LayoutLMv2Processor` which can be used to directly
+prepare data for the model (including applying OCR under the hood). More information can be found in the "Usage"
+section below.
+
+- Internally, :class:`~transformers.LayoutLMv2Model` will send the :obj:`image` input through its visual backbone to
+  obtain a lower-resolution feature map, whose shape is equal to the :obj:`image_feature_pool_shape` attribute of
+  :class:`~transformers.LayoutLMv2Config`. This feature map is then flattened to obtain a sequence of image tokens. As
+  the size of the feature map is 7x7 by default, one obtains 49 image tokens. These are then concatenated with the text
+  tokens, and sent through the Transformer encoder. This means that the last hidden states of the model will have a
+  length of 512 + 49 = 561, if you pad the text tokens up to the max length. More generally, the last hidden states
+  will have a length of :obj:`seq_length` + :obj:`config.image_feature_pool_shape[0]` *
+  :obj:`config.image_feature_pool_shape[1]`.
+- When calling :meth:`~transformers.LayoutLMv2Model.from_pretrained`, a warning will be printed with a long list of
+  parameter names that are not initialized. This is not a problem, as these parameters are batch normalization
+  statistics, which are going to have values when fine-tuning on a custom dataset.
+- If you want to train the model in a distributed environment, make sure to call :meth:`synchronize_batch_norm` on the
+  model in order to properly synchronize the batch normalization layers of the visual backbone.
+
+In addition, there's LayoutXLM, which is a multilingual version of LayoutLMv2. More information can be found on
+:doc:`LayoutXLM's documentation page <layoutxlm>`.
+
+Usage: LayoutLMv2Processor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The easiest way to prepare data for the model is to use :class:`~transformers.LayoutLMv2Processor`, which internally
+combines a feature extractor (:class:`~transformers.LayoutLMv2FeatureExtractor`) and a tokenizer
+(:class:`~transformers.LayoutLMv2Tokenizer` or :class:`~transformers.LayoutLMv2TokenizerFast`). The feature extractor
+handles the image modality, while the tokenizer handles the text modality. A processor combines both, which is ideal
+for a multi-modal model like LayoutLMv2. Note that you can still use both separately, if you only want to handle one
+modality.
+
+.. code-block::
+
+    from transformers import LayoutLMv2FeatureExtractor, LayoutLMv2TokenizerFast, LayoutLMv2Processor
+
+    feature_extractor = LayoutLMv2FeatureExtractor() # apply_ocr is set to True by default
+    tokenizer = LayoutLMv2TokenizerFast.from_pretrained("microsoft/layoutlmv2-base-uncased")
+    processor = LayoutLMv2Processor(feature_extractor, tokenizer)
+
+In short, one can provide a document image (and possibly additional data) to
+:class:`~transformers.LayoutLMv2Processor`, and it will create the inputs expected by the model. Internally, the
+processor first uses :class:`~transformers.LayoutLMv2FeatureExtractor` to apply OCR on the image to get a list of
+words and normalized bounding boxes, as well as to resize the image to a given size in order to get the :obj:`image`
+input. The words and normalized bounding boxes are then provided to :class:`~transformers.LayoutLMv2Tokenizer` or
+:class:`~transformers.LayoutLMv2TokenizerFast`, which converts them to token-level :obj:`input_ids`,
+:obj:`attention_mask`, :obj:`token_type_ids`, :obj:`bbox`. Optionally, one can provide word labels to the processor,
+which are turned into token-level :obj:`labels`.
+
+:class:`~transformers.LayoutLMv2Processor` uses `PyTesseract <https://pypi.org/project/pytesseract/>`__, a Python
+wrapper around Google's Tesseract OCR engine, under the hood. Note that you can still use your own OCR engine of
+choice, and provide the words and normalized boxes yourself. This requires initializing
+:class:`~transformers.LayoutLMv2FeatureExtractor` with :obj:`apply_ocr` set to :obj:`False`.
+
+In total, there are 5 use cases that are supported by the processor. Below, we list them all. Note that each of these
+use cases works for both batched and non-batched inputs (we illustrate them for non-batched inputs).
+
+**Use case 1: document image classification (training, inference) + token classification (inference),
+apply_ocr=True**
+
+This is the simplest case, in which the processor (actually the feature extractor) will perform OCR on the image to get
+the words and normalized bounding boxes.
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    encoding = processor(image, return_tensors="pt") # you can also add all tokenizer parameters here such as padding, truncation
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
+
+**Use case 2: document image classification (training, inference) + token classification (inference), apply_ocr=False**
+
+In case one wants to do OCR themselves, one can initialize the feature extractor with :obj:`apply_ocr` set to
+:obj:`False`. In that case, one should provide the words and corresponding (normalized) bounding boxes themselves to
+the processor.
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    words = ["hello", "world"]
+    boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+    encoding = processor(image, words, boxes=boxes, return_tensors="pt")
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
+
+**Use case 3: token classification (training), apply_ocr=False**
+
+For token classification tasks (such as FUNSD, CORD, SROIE, Kleister-NDA), one can also provide the corresponding word
+labels in order to train a model. The processor will then convert these into token-level :obj:`labels`. By default, it
+will only label the first wordpiece of a word, and label the remaining wordpieces with -100, which is the
+:obj:`ignore_index` of PyTorch's CrossEntropyLoss. In case you want all wordpieces of a word to be labeled, you can
+initialize the tokenizer with :obj:`only_label_first_subword` set to :obj:`False`.
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    words = ["hello", "world"]
+    boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+    word_labels = [1, 2]
+    encoding = processor(image, words, boxes=boxes, word_labels=word_labels, return_tensors="pt")
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'labels', 'image'])
+
+**Use case 4: visual question answering (inference), apply_ocr=True**
+
+For visual question answering tasks (such as DocVQA), you can provide a question to the processor. By default, the
+processor will apply OCR on the image, and create [CLS] question tokens [SEP] word tokens [SEP].
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    question = "What's his name?"
+    encoding = processor(image, question, return_tensors="pt") 
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
+
+**Use case 5: visual question answering (inference), apply_ocr=False**
+
+For visual question answering tasks (such as DocVQA), you can provide a question to the processor. If you want to
+perform OCR yourself, you can provide your own words and (normalized) bounding boxes to the processor.
+
+.. code-block::
+
+    from transformers import LayoutLMv2Processor
+    from PIL import Image
+
+    processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased", revision="no_ocr")
+
+    image = Image.open("name_of_your_document - can be a png file, pdf, etc.").convert("RGB")
+    question = "What's his name?"
+    words = ["hello", "world"]
+    boxes = [[1, 2, 3, 4], [5, 6, 7, 8]] # make sure to normalize your bounding boxes
+    encoding = processor(image, question, words, boxes=boxes, return_tensors="pt")  
+    print(encoding.keys())
+    # dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'bbox', 'image'])
+
+LayoutLMv2Config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2Config
+    :members:
+
+
+LayoutLMv2FeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2FeatureExtractor
+    :members: __call__
+
+
+LayoutLMv2Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2Tokenizer
+    :members: __call__, save_vocabulary
+
+
+LayoutLMv2TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2TokenizerFast
+    :members: __call__
+
+
+LayoutLMv2Processor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2Processor
+    :members: __call__
+
+
+LayoutLMv2Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2Model
+    :members: forward
+
+
+LayoutLMv2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2ForSequenceClassification
+    :members:
+
+
+LayoutLMv2ForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2ForTokenClassification
+    :members:
+
+
+LayoutLMv2ForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMv2ForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/layoutxlm.rst
+++ b/docs/source/model_doc/layoutxlm.rst
@@ -0,0 +1,56 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+LayoutXLM
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+LayoutXLM was proposed in `LayoutXLM: Multimodal Pre-training for Multilingual Visually-rich Document Understanding
+<https://arxiv.org/abs/2104.08836>`__ by Yiheng Xu, Tengchao Lv, Lei Cui, Guoxin Wang, Yijuan Lu, Dinei Florencio, Cha
+Zhang, Furu Wei. It's a multilingual extension of the `LayoutLMv2 model <https://arxiv.org/abs/2012.14740>`__ trained
+on 53 languages.
+
+The abstract from the paper is the following:
+
+*Multimodal pre-training with text, layout, and image has achieved SOTA performance for visually-rich document
+understanding tasks recently, which demonstrates the great potential for joint learning across different modalities. In
+this paper, we present LayoutXLM, a multimodal pre-trained model for multilingual document understanding, which aims to
+bridge the language barriers for visually-rich document understanding. To accurately evaluate LayoutXLM, we also
+introduce a multilingual form understanding benchmark dataset named XFUN, which includes form understanding samples in
+7 languages (Chinese, Japanese, Spanish, French, Italian, German, Portuguese), and key-value pairs are manually labeled
+for each language. Experiment results show that the LayoutXLM model has significantly outperformed the existing SOTA
+cross-lingual pre-trained models on the XFUN dataset.*
+
+One can directly plug in the weights of LayoutXLM into a LayoutLMv2 model, like so:
+
+.. code-block::
+
+    from transformers import LayoutLMv2Model
+
+    model = LayoutLMv2Model.from_pretrained('microsoft/layoutxlm-base') 
+
+Note that LayoutXLM requires a different tokenizer, based on :class:`~transformers.XLMRobertaTokenizer`. You can
+initialize it as follows:
+
+.. code-block::
+
+    from transformers import AutoTokenizer
+
+    tokenizer = AutoTokenizer.from_pretrained('microsoft/layoutxlm-base') 
+
+As LayoutXLM's architecture is equivalent to that of LayoutLMv2, one can refer to :doc:`LayoutLMv2's documentation page
+<layoutlmv2>` for all tips, code examples and notebooks.
+
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
+<https://github.com/microsoft/unilm>`__.
--- a/docs/source/model_doc/led.rst
+++ b/docs/source/model_doc/led.rst
@@ -46,8 +46,8 @@ Tips:
 - LED makes use of *global attention* by means of the ``global_attention_mask`` (see
  :class:`~transformers.LongformerModel`). For summarization, it is advised to put *global attention* only on the first
  ``<s>`` token. For question answering, it is advised to put *global attention* on all tokens of the question.
- To fine-tune LED on all 16384, it is necessary to enable *gradient checkpointing* by setting
-  ``config.gradient_checkpointing = True``.
+- To fine-tune LED on all 16384 tokens, it is necessary to enable *gradient checkpointing* by executing
+  ``model.gradient_checkpointing_enable()``.
 - A notebook showing how to evaluate LED, can be accessed `here
  <https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing>`__.
 - A notebook showing how to fine-tune LED, can be accessed `here
--- a/docs/source/model_doc/m2m_100.rst
+++ b/docs/source/model_doc/m2m_100.rst
@@ -58,7 +58,7 @@ examples. To install :obj:`sentencepiece` run ``pip install sentencepiece``.
    tokenizer = M2M100Tokenizer.from_pretrained('facebook/m2m100_418M', src_lang="en", tgt_lang="fr")

    src_text = "Life is like a box of chocolates."
-    tgt_lang = "La vie est comme une boîte de chocolat."
+    tgt_text = "La vie est comme une boîte de chocolat."

    model_inputs = tokenizer(src_text, return_tensors="pt")
    with tokenizer.as_target_tokenizer():
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -103,8 +103,8 @@ Here is the code to see all available pretrained models on the hub:

 .. code-block:: python

-    from transformers.hf_api import HfApi
-    model_list = HfApi().model_list()
+    from huggingface_hub.hf_api import HfApi
+    model_list = HfApi().list_models()
    org = "Helsinki-NLP"
    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
    suffix = [x.split('/')[1] for x in model_ids]
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@@ -49,11 +49,11 @@ inside the context manager :meth:`~transformers.MBartTokenizer.as_target_tokeniz

    >>> from transformers import MBartForConditionalGeneration, MBartTokenizer

-    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
+    >>> tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro", src_lang="en_XX", tgt_lang="ro_RO")
    >>> example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
    >>> expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"

-    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt", src_lang="en_XX", tgt_lang="ro_RO")
+    >>> inputs = tokenizer(example_english_phrase, return_tensors="pt")
    >>> with tokenizer.as_target_tokenizer():
    ...     labels = tokenizer(expected_translation_romanian, return_tensors="pt")

--- a/docs/source/model_doc/mt5.rst
+++ b/docs/source/model_doc/mt5.rst
@@ -10,7 +10,7 @@
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
    specific language governing permissions and limitations under the License.

-MT5
+mT5
 -----------------------------------------------------------------------------------------------------------------------

 Overview
@@ -24,9 +24,28 @@ The abstract from the paper is the following:

 *The recent "Text-to-Text Transfer Transformer" (T5) leveraged a unified text-to-text format and scale to attain
 state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a
-multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We describe
+multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We detail
 the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual
-benchmarks. All of the code and model checkpoints*
+benchmarks. We also describe a simple technique to prevent "accidental translation" in the zero-shot setting, where a
+generative model chooses to (partially) translate its prediction into the wrong language. All of the code and model
+checkpoints used in this work are publicly available.*
+
+Note: mT5 was only pre-trained on `mC4 <https://huggingface.co/datasets/mc4>`__ excluding any supervised training.
+Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5 model.
+Since mT5 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
+fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
+
+Google has released the following variants:
+
+- `google/mt5-small <https://huggingface.co/google/mt5-small>`__
+
+- `google/mt5-base <https://huggingface.co/google/mt5-base>`__
+
+- `google/mt5-large <https://huggingface.co/google/mt5-large>`__
+
+- `google/mt5-xl <https://huggingface.co/google/mt5-xl>`__
+
+- `google/mt5-xxl <https://huggingface.co/google/mt5-xxl>`__.

 This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
 found `here <https://github.com/google-research/multilingual-t5>`__.
@@ -94,3 +113,17 @@ TFMT5EncoderModel

 .. autoclass:: transformers.TFMT5EncoderModel
    :members:
+
+
+FlaxMT5Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMT5Model
+    :members:
+
+
+FlaxMT5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMT5ForConditionalGeneration
+    :members:
--- a/docs/source/model_doc/pegasus.rst
+++ b/docs/source/model_doc/pegasus.rst
@@ -152,3 +152,17 @@ TFPegasusForConditionalGeneration

 .. autoclass:: transformers.TFPegasusForConditionalGeneration
    :members: call
+
+
+FlaxPegasusModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxPegasusModel
+    :members: __call__, encode, decode
+
+
+FlaxPegasusForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxPegasusForConditionalGeneration
+    :members: __call__, encode, decode
--- a/docs/source/model_doc/phobert.rst
+++ b/docs/source/model_doc/phobert.rst
@@ -50,7 +50,8 @@ Example of use:
    >>> # phobert = TFAutoModel.from_pretrained("vinai/phobert-base")


-    This model was contributed by `dqnguyen <https://huggingface.co/dqnguyen>`__. The original code can be found `here <https://github.com/VinAIResearch/PhoBERT>`__.
+This model was contributed by `dqnguyen <https://huggingface.co/dqnguyen>`__. The original code can be found `here
+<https://github.com/VinAIResearch/PhoBERT>`__.

 PhobertTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/rembert.rst
+++ b/docs/source/model_doc/rembert.rst
@@ -0,0 +1,161 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+RemBERT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The RemBERT model was proposed in `Rethinking Embedding Coupling in Pre-trained Language Models
+<https://arxiv.org/abs/2010.12821>`__ by Hyung Won Chung, Thibault Févry, Henry Tsai, Melvin Johnson, Sebastian Ruder.
+
+The abstract from the paper is the following:
+
+*We re-evaluate the standard practice of sharing weights between input and output embeddings in state-of-the-art
+pre-trained language models. We show that decoupled embeddings provide increased modeling flexibility, allowing us to
+significantly improve the efficiency of parameter allocation in the input embedding of multilingual models. By
+reallocating the input embedding parameters in the Transformer layers, we achieve dramatically better performance on
+standard natural language understanding tasks with the same number of parameters during fine-tuning. We also show that
+allocating additional capacity to the output embedding provides benefits to the model that persist through the
+fine-tuning stage even though the output embedding is discarded after pre-training. Our analysis shows that larger
+output embeddings prevent the model's last layers from overspecializing to the pre-training task and encourage
+Transformer representations to be more general and more transferable to other tasks and languages. Harnessing these
+findings, we are able to train models that achieve strong performance on the XTREME benchmark without increasing the
+number of parameters at the fine-tuning stage.*
+
+Tips:
+
+For fine-tuning, RemBERT can be thought of as a bigger version of mBERT with an ALBERT-like factorization of the
+embedding layer. The embeddings are not tied in pre-training, in contrast with BERT, which enables smaller input
+embeddings (preserved during fine-tuning) and bigger output embeddings (discarded at fine-tuning). The tokenizer is
+also similar to the ALBERT one rather than the BERT one.
+
+RemBertConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertConfig
+    :members:
+
+
+RemBertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+RemBertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertTokenizerFast
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+RemBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertModel
+    :members: forward
+
+
+RemBertForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForCausalLM
+    :members: forward
+
+
+RemBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForMaskedLM
+    :members: forward
+
+
+RemBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForSequenceClassification
+    :members: forward
+
+
+RemBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForMultipleChoice
+    :members: forward
+
+
+RemBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForTokenClassification
+    :members: forward
+
+
+RemBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RemBertForQuestionAnswering
+    :members: forward
+
+
+TFRemBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertModel
+    :members: call
+
+
+TFRemBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForMaskedLM
+    :members: call
+
+
+TFRemBertForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForCausalLM
+    :members: call
+
+
+TFRemBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForSequenceClassification
+    :members: call
+
+
+TFRemBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForMultipleChoice
+    :members: call
+
+
+TFRemBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForTokenClassification
+    :members: call
+
+
+TFRemBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRemBertForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -126,6 +126,13 @@ TFRobertaModel
    :members: call


+TFRobertaForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRobertaForCausalLM
+    :members: call
+
+
 TFRobertaForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/segformer.rst
+++ b/docs/source/model_doc/segformer.rst
@@ -0,0 +1,80 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+SegFormer
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The SegFormer model was proposed in `SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers
+<https://arxiv.org/abs/2105.15203>`__ by Enze Xie, Wenhai Wang, Zhiding Yu, Anima Anandkumar, Jose M. Alvarez, Ping
+Luo. The model consists of a hierarchical Transformer encoder and a lightweight all-MLP decode head to achieve great
+results on image segmentation benchmarks such as ADE20K and Cityscapes.
+
+The abstract from the paper is the following:
+
+*We present SegFormer, a simple, efficient yet powerful semantic segmentation framework which unifies Transformers with
+lightweight multilayer perception (MLP) decoders. SegFormer has two appealing features: 1) SegFormer comprises a novel
+hierarchically structured Transformer encoder which outputs multiscale features. It does not need positional encoding,
+thereby avoiding the interpolation of positional codes which leads to decreased performance when the testing resolution
+differs from training. 2) SegFormer avoids complex decoders. The proposed MLP decoder aggregates information from
+different layers, and thus combining both local attention and global attention to render powerful representations. We
+show that this simple and lightweight design is the key to efficient segmentation on Transformers. We scale our
+approach up to obtain a series of models from SegFormer-B0 to SegFormer-B5, reaching significantly better performance
+and efficiency than previous counterparts. For example, SegFormer-B4 achieves 50.3% mIoU on ADE20K with 64M parameters,
+being 5x smaller and 2.2% better than the previous best method. Our best model, SegFormer-B5, achieves 84.0% mIoU on
+Cityscapes validation set and shows excellent zero-shot robustness on Cityscapes-C.*
+
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
+<https://github.com/NVlabs/SegFormer>`__.
+
+SegformerConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SegformerConfig
+    :members:
+
+
+SegformerFeatureExtractor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SegformerFeatureExtractor
+    :members: __call__
+
+
+SegformerModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SegformerModel
+    :members: forward
+
+
+SegformerDecodeHead
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SegformerDecodeHead
+    :members: forward
+
+
+SegformerForImageClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SegformerForImageClassification
+    :members: forward
+
+
+SegformerForSemanticSegmentation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SegformerForSemanticSegmentation
+    :members: forward
--- a/docs/source/model_doc/sew.rst
+++ b/docs/source/model_doc/sew.rst
@@ -0,0 +1,67 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+SEW
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+SEW (Squeezed and Efficient Wav2Vec) was proposed in `Performance-Efficiency Trade-offs in Unsupervised Pre-training
+for Speech Recognition <https://arxiv.org/abs/2109.06870>`__ by Felix Wu, Kwangyoun Kim, Jing Pan, Kyu Han, Kilian Q.
+Weinberger, Yoav Artzi.
+
+The abstract from the paper is the following:
+
+*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition
+(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance
+and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a
+pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a
+variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x
+inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference
+time, SEW reduces word error rate by 25-50% across different model sizes.*
+
+Tips:
+
+- SEW is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
+- SEWForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded using
+  :class:`~transformers.Wav2Vec2CTCTokenizer`.
+
+This model was contributed by `anton-l <https://huggingface.co/anton-l>`__.
+
+
+SEWConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SEWConfig
+    :members:
+
+
+SEWModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SEWModel
+    :members: forward
+
+
+SEWForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SEWForCTC
+    :members: forward
+
+
+SEWForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SEWForSequenceClassification
+    :members: forward
--- a/docs/source/model_doc/sew_d.rst
+++ b/docs/source/model_doc/sew_d.rst
@@ -0,0 +1,66 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+SEW-D
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+SEW-D (Squeezed and Efficient Wav2Vec with Disentangled attention) was proposed in `Performance-Efficiency Trade-offs
+in Unsupervised Pre-training for Speech Recognition <https://arxiv.org/abs/2109.06870>`__ by Felix Wu, Kwangyoun Kim,
+Jing Pan, Kyu Han, Kilian Q. Weinberger, Yoav Artzi.
+
+The abstract from the paper is the following:
+
+*This paper is a study of performance-efficiency trade-offs in pre-trained models for automatic speech recognition
+(ASR). We focus on wav2vec 2.0, and formalize several architecture designs that influence both the model performance
+and its efficiency. Putting together all our observations, we introduce SEW (Squeezed and Efficient Wav2vec), a
+pre-trained model architecture with significant improvements along both performance and efficiency dimensions across a
+variety of training setups. For example, under the 100h-960h semi-supervised setup on LibriSpeech, SEW achieves a 1.9x
+inference speedup compared to wav2vec 2.0, with a 13.5% relative reduction in word error rate. With a similar inference
+time, SEW reduces word error rate by 25-50% across different model sizes.*
+
+Tips:
+
+- SEW-D is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
+- SEWDForCTC is fine-tuned using connectionist temporal classification (CTC) so the model output has to be decoded
+  using :class:`~transformers.Wav2Vec2CTCTokenizer`.
+
+This model was contributed by `anton-l <https://huggingface.co/anton-l>`__.
+
+
+SEWDConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SEWDConfig
+    :members:
+
+
+SEWDModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SEWDModel
+    :members: forward
+
+
+SEWDForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SEWDForCTC
+    :members: forward
+
+SEWDForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SEWDForSequenceClassification
+    :members: forward
--- a/docs/source/model_doc/speech_to_text.rst
+++ b/docs/source/model_doc/speech_to_text.rst
@@ -42,8 +42,8 @@ features. The :class:`~transformers.Speech2TextProcessor` wraps :class:`~transfo
 predicted token ids.

 The feature extractor depends on :obj:`torchaudio` and the tokenizer depends on :obj:`sentencepiece` so be sure to
-install those packages before running the examples. You could either install those as extra speech dependancies with
-``pip install transformers"[speech, sentencepiece]"`` or install the packages seperatly with ``pip install torchaudio
+install those packages before running the examples. You could either install those as extra speech dependencies with
+``pip install transformers"[speech, sentencepiece]"`` or install the packages separately with ``pip install torchaudio
 sentencepiece``. Also ``torchaudio`` requires the development version of the `libsndfile
 <http://www.mega-nerd.com/libsndfile/>`__ package which can be installed via a system package manager. On Ubuntu it can
 be installed as follows: ``apt install libsndfile1-dev``
@@ -66,7 +66,7 @@ be installed as follows: ``apt install libsndfile1-dev``
        ...     batch["speech"] = speech
        ...     return batch

-        >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
@@ -98,7 +98,7 @@ be installed as follows: ``apt install libsndfile1-dev``
        ...     batch["speech"] = speech
        ...     return batch

-        >>> ds = load_dataset("patrickvonplaten/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
--- a/docs/source/model_doc/speech_to_text_2.rst
+++ b/docs/source/model_doc/speech_to_text_2.rst
@@ -0,0 +1,123 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Speech2Text2
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Speech2Text2 model is used together with :doc:`Wav2Vec2 <wav2vec2>` for Speech Translation models proposed in
+`Large-Scale Self- and Semi-Supervised Learning for Speech Translation <https://arxiv.org/abs/2104.06678>`__ by
+Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli, Alexis Conneau.
+
+Speech2Text2 is a *decoder-only* transformer model that can be used with any speech *encoder-only* model, such as
+:doc:`Wav2Vec2 <wav2vec2>` or :doc:`HuBERT <hubert>` for Speech-to-Text tasks. Please refer to the
+:doc:`SpeechEncoderDecoder <speechencoderdecoder>` class on how to combine Speech2Text2 with any speech *encoder-only*
+model.
+
+This model was contributed by `Patrick von Platen <https://huggingface.co/patrickvonplaten>`__.
+
+The original code can be found `here
+<https://github.com/pytorch/fairseq/blob/1f7ef9ed1e1061f8c7f88f8b94c7186834398690/fairseq/models/wav2vec/wav2vec2_asr.py#L266>`__.
+
+
+Tips:
+
+- Speech2Text2 achieves state-of-the-art results on the CoVoST Speech Translation dataset. For more information, see
+  the `official models <https://huggingface.co/models?other=speech2text2>`__ .
+- Speech2Text2 is always used within the :doc:`SpeechEncoderDecoder <speechencoderdecoder>` framework.
+- Speech2Text2's tokenizer currently only supports inference, but not training.
+
+Inference
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Speech2Text2's :class:`~transformers.SpeechEncoderDecoderModel` model accepts raw waveform input values from speech and
+makes use of :func:`~transformers.generation_utils.GenerationMixin.generate` to translate the input speech
+autoregressively to the target language.
+
+The :class:`~transformers.Wav2Vec2FeatureExtractor` class is responsible for preprocessing the input speech and
+:class:`~transformers.Speech2Text2Tokenizer` decodes the generated target tokens to the target string. The
+:class:`~transformers.Speech2Text2Processor` wraps :class:`~transformers.Wav2Vec2FeatureExtractor` and
+:class:`~transformers.Speech2Text2Tokenizer` into a single instance to both extract the input features and decode the
+predicted token ids.
+
+- Step-by-step Speech Translation
+
+.. code-block::
+
+        >>> import torch
+        >>> from transformers import Speech2Text2Processor, SpeechEncoderDecoderModel
+        >>> from datasets import load_dataset
+        >>> import soundfile as sf
+
+        >>> model = SpeechEncoderDecoderModel.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+        >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+
+        >>> def map_to_array(batch):
+        ...     speech, _ = sf.read(batch["file"])
+        ...     batch["speech"] = speech
+        ...     return batch
+
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> ds = ds.map(map_to_array)
+
+        >>> inputs = processor(ds["speech"][0], sampling_rate=16_000, return_tensors="pt")
+        >>> generated_ids = model.generate(input_ids=inputs["input_values"], attention_mask=inputs["attention_mask"])
+
+        >>> transcription = processor.batch_decode(generated_ids)
+
+
+- Speech Translation via Pipelines
+
+    The automatic speech recognition pipeline can also be used to translate speech in just a couple of lines of code.
+
+.. code-block::
+
+        >>> from datasets import load_dataset
+        >>> from transformers import pipeline
+
+        >>> librispeech_en = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> asr = pipeline("automatic-speech-recognition", model="facebook/s2t-wav2vec2-large-en-de", feature_extractor="facebook/s2t-wav2vec2-large-en-de")
+
+        >>> translation_de = asr(librispeech_en[0]["file"])
+
+
+See `model hub <https://huggingface.co/models?filter=speech2text2>`__ to look for Speech2Text2 checkpoints.
+
+
+Speech2Text2Config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Speech2Text2Config
+    :members:
+
+
+Speech2Text2Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Speech2Text2Tokenizer
+    :members: batch_decode, decode, save_vocabulary
+
+
+Speech2Text2Processor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Speech2Text2Processor
+    :members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
+
+
+Speech2Text2ForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Speech2Text2ForCausalLM
+    :members: forward
--- a/docs/source/model_doc/speechencoderdecoder.rst
+++ b/docs/source/model_doc/speechencoderdecoder.rst
@@ -0,0 +1,40 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Speech Encoder Decoder Models
+-----------------------------------------------------------------------------------------------------------------------
+
+The :class:`~transformers.SpeechEncoderDecoderModel` can be used to initialize a speech-sequence-to-text-sequence model
+with any pretrained speech autoencoding model as the encoder (*e.g.* :doc:`Wav2Vec2 <wav2vec2>`, :doc:`Hubert
+<hubert>`) and any pretrained autoregressive model as the decoder.
+
+The effectiveness of initializing speech-sequence-to-text-sequence models with pretrained checkpoints for speech
+recognition and speech translation has *e.g.* been shown in `Large-Scale Self- and Semi-Supervised Learning for Speech
+Translation <https://arxiv.org/abs/2104.06678>`__ by Changhan Wang, Anne Wu, Juan Pino, Alexei Baevski, Michael Auli,
+Alexis Conneau.
+
+An example of how to use a :class:`~transformers.SpeechEncoderDecoderModel` for inference can be seen in
+:doc:`Speech2Text2 <speech_to_text_2>`.
+
+
+SpeechEncoderDecoderConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SpeechEncoderDecoderConfig
+    :members:
+
+
+SpeechEncoderDecoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SpeechEncoderDecoderModel
+    :members: forward, from_encoder_decoder_pretrained
--- a/docs/source/model_doc/splinter.rst
+++ b/docs/source/model_doc/splinter.rst
@@ -0,0 +1,87 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Splinter
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Splinter model was proposed in `Few-Shot Question Answering by Pretraining Span Selection
+<https://arxiv.org/abs/2101.00438>`__ by Ori Ram, Yuval Kirstain, Jonathan Berant, Amir Globerson, Omer Levy. Splinter
+is an encoder-only transformer (similar to BERT) pretrained using the recurring span selection task on a large corpus
+comprising Wikipedia and the Toronto Book Corpus.
+
+The abstract from the paper is the following:
+
+In several question answering benchmarks, pretrained models have reached human parity through fine-tuning on an order
+of 100,000 annotated questions and answers. We explore the more realistic few-shot setting, where only a few hundred
+training examples are available, and observe that standard models perform poorly, highlighting the discrepancy between
+current pretraining objectives and question answering. We propose a new pretraining scheme tailored for question
+answering: recurring span selection. Given a passage with multiple sets of recurring spans, we mask in each set all
+recurring spans but one, and ask the model to select the correct span in the passage for each masked span. Masked spans
+are replaced with a special token, viewed as a question representation, that is later used during fine-tuning to select
+the answer span. The resulting model obtains surprisingly good results on multiple benchmarks (e.g., 72.7 F1 on SQuAD
+with only 128 training examples), while maintaining competitive performance in the high-resource setting.
+
+Tips:
+
+- Splinter was trained to predict answer spans conditioned on a special [QUESTION] token. These tokens contextualize
+  to question representations which are used to predict the answers. This layer is called QASS, and is the default
+  behaviour in the :class:`~transformers.SplinterForQuestionAnswering` class. Therefore:
+- Use :class:`~transformers.SplinterTokenizer` (rather than :class:`~transformers.BertTokenizer`), as it already
+  contains this special token. Also, its default behavior is to use this token when two sequences are given (for
+  example, in the `run_qa.py` script).
+- If you plan on using Splinter outside `run_qa.py`, please keep in mind the question token - it might be important for
+  the success of your model, especially in a few-shot setting.
+- Please note there are two different checkpoints for each size of Splinter. Both are basically the same, except that
+  one also has the pretrained weights of the QASS layer (`tau/splinter-base-qass` and `tau/splinter-large-qass`) and
+  one doesn't (`tau/splinter-base` and `tau/splinter-large`). This is done to support randomly initializing this layer
+  at fine-tuning, as it is shown to yield better results for some cases in the paper.
+
+This model was contributed by `yuvalkirstain <https://huggingface.co/yuvalkirstain>`__ and `oriram
+<https://huggingface.co/oriram>`__. The original code can be found `here <https://github.com/oriram/splinter>`__.
+
+SplinterConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterConfig
+    :members:
+
+
+SplinterTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+SplinterTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterTokenizerFast
+    :members:
+
+
+SplinterModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterModel
+    :members: forward
+
+
+SplinterForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SplinterForQuestionAnswering
+    :members: forward
--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -13,9 +13,6 @@
 T5
 -----------------------------------------------------------------------------------------------------------------------

-**DISCLAIMER:** This model is still a work in progress, if you see something strange, file a `Github Issue
-<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__.
-
 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -42,28 +39,56 @@ Tips:
  different prefix to the input corresponding to each task, e.g., for translation: *translate English to German: ...*,
  for summarization: *summarize: ...*.

-  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper
-  <https://arxiv.org/pdf/1910.10683.pdf>`__. - For sequence-to-sequence generation, it is recommended to use
-  :meth:`~transformers.generation_utils.GenerationMixin.generate`. This method takes care of feeding the encoded input
-  via cross-attention layers to the decoder and auto-regressively generates the decoder output. - T5 uses relative
-  scalar embeddings. Encoder input padding can be done on the left and on the right.
+- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
+
+- See the :ref:`training`, :ref:`inference` and :ref:`scripts` sections below for all details regarding usage.
+
+T5 comes in different sizes:
+
+- `t5-small <https://huggingface.co/t5-small>`__
+
+- `t5-base <https://huggingface.co/t5-base>`__
+
+- `t5-large <https://huggingface.co/t5-large>`__
+
+- `t5-3b <https://huggingface.co/t5-3b>`__
+
+- `t5-11b <https://huggingface.co/t5-11b>`__.
+
+Based on the original T5 model, Google has released some follow-up works:
+
+- **T5v1.1**: T5v1.1 is an improved version of T5 with some architectural tweaks, and is pre-trained on C4 only without
+  mixing in the supervised tasks. Refer to the documentation of T5v1.1 which can be found :doc:`here <t5v1.1>`.
+
+- **mT5**: mT5 is a multilingual T5 model. It is pre-trained on the mC4 corpus, which includes 101 languages. Refer to
+  the documentation of mT5 which can be found :doc:`here <mt5>`.
+
+- **byT5**: byT5 is a T5 model pre-trained on byte sequences rather than SentencePiece subword token sequences. Refer
+  to the documentation of byT5 which can be found :doc:`here <byt5>`.
+
+All checkpoints can be found on the `hub <https://huggingface.co/models?search=t5>`__.

 This model was contributed by `thomwolf <https://huggingface.co/thomwolf>`__. The original code can be found `here
 <https://github.com/google-research/text-to-text-transfer-transformer>`__.

+.. _training:
+
 Training
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher
-forcing. This means that for training we always need an input sequence and a target sequence. The input sequence is fed
-to the model using :obj:`input_ids`. The target sequence is shifted to the right, i.e., prepended by a start-sequence
-token and fed to the decoder using the :obj:`decoder_input_ids`. In teacher-forcing style, the target sequence is then
-appended by the EOS token and corresponds to the :obj:`labels`. The PAD token is hereby used as the start-sequence
-token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
+forcing. This means that for training, we always need an input sequence and a corresponding target sequence. The input
+sequence is fed to the model using :obj:`input_ids`. The target sequence is shifted to the right, i.e., prepended by a
+start-sequence token and fed to the decoder using the :obj:`decoder_input_ids`. In teacher-forcing style, the target
+sequence is then appended by the EOS token and corresponds to the :obj:`labels`. The PAD token is hereby used as the
+start-sequence token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
+
+One can use :class:`~transformers.T5ForConditionalGeneration` (or the Tensorflow/Flax variant), which includes the
+language modeling head on top of the decoder.

 - Unsupervised denoising training

-  In this setup spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
+  In this setup, spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
  the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. Each
  sentinel token represents a unique mask token for this sentence and should start with :obj:`<extra_id_0>`,
  :obj:`<extra_id_1>`, ... up to :obj:`<extra_id_99>`. As a default, 100 sentinel tokens are available in
@@ -72,34 +97,201 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash
  For instance, the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be
  processed as follows:

-.. code-block::
+    .. code-block::

-    from transformers import T5ForConditionalGeneration, T5Tokenizer
-    model = T5ForConditionalGeneration.from_pretrained("t5-small")
-    tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        from transformers import T5Tokenizer, T5ForConditionalGeneration

-    input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
-    labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
-    # the forward function automatically creates the correct decoder_input_ids
-    loss = model(input_ids=input_ids, labels=labels).loss
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
+        labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
+        # the forward function automatically creates the correct decoder_input_ids
+        loss = model(input_ids=input_ids, labels=labels).loss
+
+  If you're interested in pre-training T5 on a new corpus, check out the `run_t5_mlm_flax.py
+  <https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling>`__ script in the Examples
+  directory.

 - Supervised training

-  In this setup the input sequence and output sequence are standard sequence-to-sequence input output mapping. In
-  translation, for instance with the input sequence "The house is wonderful." and output sequence "Das Haus ist
-  wunderbar.", the sentences should be processed as follows:
+  In this setup, the input sequence and output sequence are a standard sequence-to-sequence input-output mapping.
+  Suppose that we want to fine-tune the model for translation for example, and we have a training example: the input
+  sequence "The house is wonderful." and output sequence "Das Haus ist wunderbar.", then they should be prepared for
+  the model as follows:
+
+    .. code-block::
+
+        from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+        labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
+        # the forward function automatically creates the correct decoder_input_ids
+        loss = model(input_ids=input_ids, labels=labels).loss
+
+  As you can see, only 2 inputs are required for the model in order to compute a loss: :obj:`input_ids` (which are the
+  :obj:`input_ids` of the encoded input sequence) and :obj:`labels` (which are the :obj:`input_ids` of the encoded
+  target sequence). The model will automatically create the :obj:`decoder_input_ids` based on the :obj:`labels`, by
+  shifting them one position to the right and prepending the :obj:`config.decoder_start_token_id`, which for T5 is
+  equal to 0 (i.e. the id of the pad token). Also note the task prefix: we prepend the input sequence with 'translate
+  English to German: ' before encoding it. This will help in improving the performance, as this task prefix was used
+  during T5's pre-training.
+
+  However, the example above only shows a single training example. In practice, one trains deep learning models in
+  batches. This entails that we must pad/truncate examples to the same length. For encoder-decoder models, one
+  typically defines a :obj:`max_source_length` and :obj:`max_target_length`, which determine the maximum length of the
+  input and output sequences respectively (otherwise they are truncated). These should be carefully set depending on
+  the task.
+
+  In addition, we must make sure that padding token id's of the :obj:`labels` are not taken into account by the loss
+  function. In PyTorch and Tensorflow, this can be done by replacing them with -100, which is the :obj:`ignore_index`
+  of the :obj:`CrossEntropyLoss`. In Flax, one can use the :obj:`decoder_attention_mask` to ignore padded tokens from
+  the loss (see the `Flax summarization script
+  <https://github.com/huggingface/transformers/tree/master/examples/flax/summarization>`__ for details). We also pass
+  :obj:`attention_mask` as additional input to the model, which makes sure that padding tokens of the inputs are
+  ignored. The code example below illustrates all of this.
+
+    .. code-block::
+
+        from transformers import T5Tokenizer, T5ForConditionalGeneration 
+        import torch
+
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        # the following 2 hyperparameters are task-specific
+        max_source_length = 512
+        max_target_length = 128
+
+        # Suppose we have the following 2 training examples:
+        input_sequence_1 = "Welcome to NYC"
+        output_sequence_1 = "Bienvenue à NYC"
+
+        input_sequence_2 = "HuggingFace is a company"
+        output_sequence_2 = "HuggingFace est une entreprise"
+
+        # encode the inputs
+        task_prefix = "translate English to French: "
+        input_sequences = [input_sequence_1, input_sequence_2]
+        encoding = tokenizer([task_prefix + sequence for sequence in input_sequences], 
+                             padding='longest', 
+                             max_length=max_source_length, 
+                             truncation=True, 
+                             return_tensors="pt")
+        input_ids, attention_mask = encoding.input_ids, encoding.attention_mask
+
+        # encode the targets
+        target_encoding = tokenizer([output_sequence_1, output_sequence_2], 
+                                    padding='longest', 
+                                    max_length=max_target_length, 
+                                    truncation=True)
+        labels = target_encoding.input_ids
+
+        # replace padding token id's of the labels by -100
+        labels = [
+                   [(label if label != tokenizer.pad_token_id else -100) for label in labels_example] for labels_example in labels
+        ] 
+        labels = torch.tensor(labels)
+
+        # forward pass
+        loss = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels).loss
+
+Additional training tips:
+
+- T5 models need a slightly higher learning rate than the default one set in the :obj:`Trainer` when using the AdamW
+  optimizer. Typically, 1e-4 and 3e-4 work well for most problems (classification, summarization, translation, question
+  answering, question generation). Note that T5 was pre-trained using the AdaFactor optimizer.
+
+- According to `this forum post <https://discuss.huggingface.co/t/t5-finetuning-tips/684>`__, task prefixes matter when
+  (1) doing multi-task training (2) your task is similar or related to one of the supervised tasks used in T5's
+  pre-training mixture (see Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`__ for the task prefixes
+  used).
+
+- If training on TPU, it is recommended to pad all examples of the dataset to the same length or make use of
+  `pad_to_multiple_of` to have a small number of predefined bucket sizes to fit all examples in. Dynamically padding
+  batches (i.e. only padding up to the longest example in a batch) is not recommended on TPU, as it triggers a
+  recompilation for every batch shape that is encountered during training, thus significantly slowing down training.
+
+.. _inference:
+
+Inference
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+At inference time, it is recommended to use :meth:`~transformers.generation_utils.GenerationMixin.generate`. This
+method takes care of encoding the input and feeding the encoded hidden states via cross-attention layers to the decoder
+and auto-regressively generates the decoder output. Check out `this blog post
+<https://huggingface.co/blog/how-to-generate>`__ to know all the details about generating text with Transformers.
+There's also `this blog post <https://huggingface.co/blog/encoder-decoder#encoder-decoder>`__ which explains how
+generation works in general in encoder-decoder models.

 .. code-block::

-    from transformers import T5ForConditionalGeneration, T5Tokenizer
-    model = T5ForConditionalGeneration.from_pretrained("t5-small")
-    tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        from transformers import T5Tokenizer, T5ForConditionalGeneration 

-    input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
-    labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
-    # the forward function automatically creates the correct decoder_input_ids
-    loss = model(input_ids=input_ids, labels=labels).loss
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")

+        input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+        outputs = model.generate(input_ids)
+        print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+        # Das Haus ist wunderbar.
+
+Note that T5 uses the :obj:`pad_token_id` as the :obj:`decoder_start_token_id`, so when doing generation without using
+:meth:`~transformers.generation_utils.GenerationMixin.generate`, make sure you start it with the :obj:`pad_token_id`.
+
+The example above only shows a single example. You can also do batched inference, like so:
+
+.. code-block::
+
+        from transformers import T5Tokenizer, T5ForConditionalGeneration
+
+        tokenizer = T5Tokenizer.from_pretrained("t5-small")
+        model = T5ForConditionalGeneration.from_pretrained("t5-small")
+
+        # when generating, we will use the logits of right-most token to predict the next token
+        # so the padding should be on the left
+        tokenizer.padding_side = "left" 
+        tokenizer.pad_token = tokenizer.eos_token # to avoid an error
+
+        task_prefix = 'translate English to German: '
+        sentences = ['The house is wonderful.', 'I like to work in NYC.'] # use different length sentences to test batching
+        inputs = tokenizer([task_prefix + sentence for sentence in sentences], return_tensors="pt", padding=True)
+
+        output_sequences = model.generate(
+            input_ids=inputs['input_ids'],
+            attention_mask=inputs['attention_mask'],
+            do_sample=False, # disable sampling to test if batching affects output
+        )
+
+        print(tokenizer.batch_decode(output_sequences, skip_special_tokens=True))
+
+        # ['Das Haus ist wunderbar.', 'Ich arbeite gerne in NYC.']
+
+.. _scripts:
+
+Example scripts
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+T5 is supported by several example scripts, both for pre-training and fine-tuning.
+
+* pre-training: the `run_t5_mlm_flax.py
+  <https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/run_t5_mlm_flax.py>`__
+  script allows you to further pre-train T5 or pre-train T5 from scratch on your own data. The `t5_tokenizer_model.py
+  <https://github.com/huggingface/transformers/blob/master/examples/flax/language-modeling/t5_tokenizer_model.py>`__
+  script allows you to further train a T5 tokenizer or train a T5 Tokenizer from scratch on your own data. Note that
+  Flax (a neural network library on top of JAX) is particularly useful to train on TPU hardware.
+
+* fine-tuning: T5 is supported by the official summarization scripts (`PyTorch
+  <https://github.com/huggingface/transformers/tree/master/examples/pytorch/summarization>`__, `Tensorflow
+  <https://github.com/huggingface/transformers/tree/master/examples/tensorflow/summarization>`__, and `Flax
+  <https://github.com/huggingface/transformers/tree/master/examples/flax/summarization>`__) and translation scripts
+  (`PyTorch <https://github.com/huggingface/transformers/tree/master/examples/pytorch/translation>`__ and `Tensorflow
+  <https://github.com/huggingface/transformers/tree/master/examples/tensorflow/translation>`__). These scripts allow
+  you to easily fine-tune T5 on custom data for summarization/translation.

 T5Config
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/t5v1.1.rst
+++ b/docs/source/model_doc/t5v1.1.rst
@@ -0,0 +1,66 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+T5v1.1
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+T5v1.1 was released in the `google-research/text-to-text-transfer-transformer
+<https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__
+repository by Colin Raffel et al. It's an improved version of the original T5 model.
+
+One can directly plug in the weights of T5v1.1 into a T5 model, like so:
+
+.. code-block::
+
+    from transformers import T5ForConditionalGeneration
+
+    model = T5ForConditionalGeneration.from_pretrained('google/t5-v1_1-base') 
+
+T5 Version 1.1 includes the following improvements compared to the original T5 model:
+
+- GEGLU activation in the feed-forward hidden layer, rather than ReLU. See `this paper
+  <https://arxiv.org/abs/2002.05202>`__.
+
+- Dropout was turned off in pre-training (quality win). Dropout should be re-enabled during fine-tuning.
+
+- Pre-trained on C4 only without mixing in the downstream tasks.
+
+- No parameter sharing between the embedding and classifier layer.
+
+- "xl" and "xxl" replace "3B" and "11B". The model shapes are a bit different - larger :obj:`d_model` and smaller
+  :obj:`num_heads` and :obj:`d_ff`.
+
+Note: T5 Version 1.1 was only pre-trained on `C4 <https://huggingface.co/datasets/c4>`__ excluding any supervised
+training. Therefore, this model has to be fine-tuned before it is useable on a downstream task, unlike the original T5
+model. Since T5v1.1 was pre-trained unsupervisedly, there's no real advantage to using a task prefix during single-task
+fine-tuning. If you are doing multi-task fine-tuning, you should use a prefix.
+
+Google has released the following variants:
+
+- `google/t5-v1_1-small <https://huggingface.co/google/t5-v1_1-small>`__
+
+- `google/t5-v1_1-base <https://huggingface.co/google/t5-v1_1-base>`__
+
+- `google/t5-v1_1-large <https://huggingface.co/google/t5-v1_1-large>`__
+
+- `google/t5-v1_1-xl <https://huggingface.co/google/t5-v1_1-xl>`__
+
+- `google/t5-v1_1-xxl <https://huggingface.co/google/t5-v1_1-xxl>`__.
+
+One can refer to :doc:`T5's documentation page <t5>` for all tips, code examples and notebooks.
+
+This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The original code can be
+found `here
+<https://github.com/google-research/text-to-text-transfer-transformer/blob/main/released_checkpoints.md#t511>`__.
--- a/docs/source/model_doc/trocr.rst
+++ b/docs/source/model_doc/trocr.rst
@@ -0,0 +1,95 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+TrOCR
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The TrOCR model was proposed in `TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models
+<https://arxiv.org/abs/2109.10282>`__ by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang,
+Zhoujun Li, Furu Wei. TrOCR consists of an image Transformer encoder and an autoregressive text Transformer decoder to
+perform `optical character recognition (OCR) <https://en.wikipedia.org/wiki/Optical_character_recognition>`__.
+
+Please refer to the :doc:`VisionEncoderDecoder <visionencoderdecoder>` class on how to use this model.
+
+This model was contributed by `Niels Rogge <https://huggingface.co/nielsr>`__.
+
+The original code can be found `here
+<https://github.com/microsoft/unilm/tree/6f60612e7cc86a2a1ae85c47231507a587ab4e01/trocr>`__.
+
+
+Tips:
+
+- TrOCR is pre-trained in 2 stages before being fine-tuned on downstream datasets. It achieves state-of-the-art results
+  on both printed (e.g. the `SROIE dataset <https://paperswithcode.com/dataset/sroie>`__) and handwritten (e.g. the
+  `IAM Handwriting dataset <https://fki.tic.heia-fr.ch/databases/iam-handwriting-database>`__) text recognition tasks.
+  For more information, see the `official models <https://huggingface.co/models?other=trocr>`__.
+- TrOCR is always used within the :doc:`VisionEncoderDecoder <visionencoderdecoder>` framework.
+
+Inference
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+TrOCR's :class:`~transformers.VisionEncoderDecoderModel` model accepts images as input and makes use of
+:func:`~transformers.generation_utils.GenerationMixin.generate` to autoregressively generate text given the input
+image.
+
+The :class:`~transformers.ViTFeatureExtractor` class is responsible for preprocessing the input image and
+:class:`~transformers.RobertaTokenizer` decodes the generated target tokens to the target string. The
+:class:`~transformers.TrOCRProcessor` wraps :class:`~transformers.ViTFeatureExtractor` and
+:class:`~transformers.RobertaTokenizer` into a single instance to both extract the input features and decode the
+predicted token ids.
+
+- Step-by-step Optical Character Recognition (OCR)
+
+.. code-block::
+
+        >>> from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
+        >>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
+
+        >>> # load image from the IAM dataset
+        >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+
+        >>> pixel_values = processor(image, return_tensors="pt").pixel_values
+        >>> generated_ids = model.generate(pixel_values)
+
+        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+
+See the `model hub <https://huggingface.co/models?filter=trocr>`__ to look for TrOCR checkpoints.
+
+
+TrOCRConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TrOCRConfig
+    :members:
+
+
+TrOCRProcessor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TrOCRProcessor
+    :members: __call__, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor
+
+
+TrOCRForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TrOCRForCausalLM
+    :members: forward
--- a/docs/source/model_doc/unispeech.rst
+++ b/docs/source/model_doc/unispeech.rst
@@ -0,0 +1,88 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+UniSpeech
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The UniSpeech model was proposed in `UniSpeech: Unified Speech Representation Learning with Labeled and Unlabeled Data
+<https://arxiv.org/abs/2101.07597>`__ by Chengyi Wang, Yu Wu, Yao Qian, Kenichi Kumatani, Shujie Liu, Furu Wei, Michael
+Zeng, Xuedong Huang.
+
+The abstract from the paper is the following:
+
+*In this paper, we propose a unified pre-training approach called UniSpeech to learn speech representations with both
+unlabeled and labeled data, in which supervised phonetic CTC learning and phonetically-aware contrastive
+self-supervised learning are conducted in a multi-task learning manner. The resultant representations can capture
+information more correlated with phonetic structures and improve the generalization across languages and domains. We
+evaluate the effectiveness of UniSpeech for cross-lingual representation learning on public CommonVoice corpus. The
+results show that UniSpeech outperforms self-supervised pretraining and supervised transfer learning for speech
+recognition by a maximum of 13.4% and 17.8% relative phone error rate reductions respectively (averaged over all
+testing languages). The transferability of UniSpeech is also demonstrated on a domain-shift speech recognition task,
+i.e., a relative word error rate reduction of 6% against the previous approach.*
+
+Tips:
+
+- UniSpeech is a speech model that accepts a float array corresponding to the raw waveform of the speech signal. Please
+  use :class:`~transformers.Wav2Vec2Processor` for the feature extraction.
+- UniSpeech model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be
+  decoded using :class:`~transformers.Wav2Vec2CTCTokenizer`.
+
+This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The Authors' code can be
+found `here <https://github.com/microsoft/UniSpeech/tree/main/UniSpeech>`__.
+
+
+UniSpeechConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.UniSpeechConfig
+    :members:
+
+
+UniSpeech specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.unispeech.modeling_unispeech.UniSpeechBaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.unispeech.modeling_unispeech.UniSpeechForPreTrainingOutput
+    :members: 
+
+
+UniSpeechModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.UniSpeechModel
+    :members: forward
+
+
+UniSpeechForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.UniSpeechForCTC
+    :members: forward
+
+
+UniSpeechForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.UniSpeechForSequenceClassification
+    :members: forward
+
+
+UniSpeechForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.UniSpeechForPreTraining
+    :members: forward
--- a/docs/source/model_doc/unispeech_sat.rst
+++ b/docs/source/model_doc/unispeech_sat.rst
@@ -0,0 +1,92 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+UniSpeech-SAT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The UniSpeech-SAT model was proposed in `UniSpeech-SAT: Universal Speech Representation Learning with Speaker Aware
+Pre-Training <https://arxiv.org/abs/2110.05752>`__ by Sanyuan Chen, Yu Wu, Chengyi Wang, Zhengyang Chen, Zhuo Chen,
+Shujie Liu, Jian Wu, Yao Qian, Furu Wei, Jinyu Li, Xiangzhan Yu.
+
+The abstract from the paper is the following:
+
+*Self-supervised learning (SSL) is a long-standing goal for speech processing, since it utilizes large-scale unlabeled
+data and avoids extensive human labeling. Recent years witness great successes in applying self-supervised learning in
+speech recognition, while limited exploration was attempted in applying SSL for modeling speaker characteristics. In
+this paper, we aim to improve the existing SSL framework for speaker representation learning. Two methods are
+introduced for enhancing the unsupervised speaker information extraction. First, we apply the multi-task learning to
+the current SSL framework, where we integrate the utterance-wise contrastive loss with the SSL objective function.
+Second, for better speaker discrimination, we propose an utterance mixing strategy for data augmentation, where
+additional overlapped utterances are created unsupervisely and incorporate during training. We integrate the proposed
+methods into the HuBERT framework. Experiment results on SUPERB benchmark show that the proposed system achieves
+state-of-the-art performance in universal representation learning, especially for speaker identification oriented
+tasks. An ablation study is performed verifying the efficacy of each proposed method. Finally, we scale up training
+dataset to 94 thousand hours public audio data and achieve further performance improvement in all SUPERB tasks.*
+
+Tips:
+
+- UniSpeechSat is a speech model that accepts a float array corresponding to the raw waveform of the speech signal.
+  Please use :class:`~transformers.Wav2Vec2Processor` for the feature extraction.
+- UniSpeechSat model can be fine-tuned using connectionist temporal classification (CTC) so the model output has to be
+  decoded using :class:`~transformers.Wav2Vec2CTCTokenizer`.
+- UniSpeechSat performs especially well on speaker verification, speaker identification, and speaker diarization tasks.
+
+This model was contributed by `patrickvonplaten <https://huggingface.co/patrickvonplaten>`__. The Authors' code can be
+found `here <https://github.com/microsoft/UniSpeech/tree/main/UniSpeech-SAT>`__.
+
+
+UniSpeechSatConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.UniSpeechSatConfig
+    :members:
+
+
+UniSpeechSat specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatBaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.unispeech_sat.modeling_unispeech_sat.UniSpeechSatForPreTrainingOutput
+    :members: 
+
+
+UniSpeechSatModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.UniSpeechSatModel
+    :members: forward
+
+
+UniSpeechSatForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.UniSpeechSatForCTC
+    :members: forward
+
+
+UniSpeechSatForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.UniSpeechSatForSequenceClassification
+    :members: forward
+
+
+UniSpeechSatForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.UniSpeechSatForPreTraining
+    :members: forward
--- a/docs/source/model_doc/visionencoderdecoder.rst
+++ b/docs/source/model_doc/visionencoderdecoder.rst
@@ -0,0 +1,41 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Vision Encoder Decoder Models
+-----------------------------------------------------------------------------------------------------------------------
+
+The :class:`~transformers.VisionEncoderDecoderModel` can be used to initialize an image-to-text-sequence model with any
+pretrained vision autoencoding model as the encoder (*e.g.* :doc:`ViT <vit>`, :doc:`BEiT <beit>`, :doc:`DeiT <deit>`)
+and any pretrained language model as the decoder (*e.g.* :doc:`RoBERTa <roberta>`, :doc:`GPT2 <gpt2>`, :doc:`BERT
+<bert>`).
+
+The effectiveness of initializing image-to-text-sequence models with pretrained checkpoints has been shown in (for
+example) `TrOCR: Transformer-based Optical Character Recognition with Pre-trained Models
+<https://arxiv.org/abs/2109.10282>`__ by Minghao Li, Tengchao Lv, Lei Cui, Yijuan Lu, Dinei Florencio, Cha Zhang,
+Zhoujun Li, Furu Wei.
+
+An example of how to use a :class:`~transformers.VisionEncoderDecoderModel` for inference can be seen in :doc:`TrOCR
+<trocr>`.
+
+
+VisionEncoderDecoderConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.VisionEncoderDecoderConfig
+    :members:
+
+
+VisionEncoderDecoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.VisionEncoderDecoderModel
+    :members: forward, from_encoder_decoder_pretrained
--- a/docs/source/model_doc/visual_bert.rst
+++ b/docs/source/model_doc/visual_bert.rst
@@ -58,9 +58,17 @@ layer, and is expected to be bound by [CLS] and a [SEP] tokens, as in BERT. The
 appropriately for the textual and visual parts.

 The :class:`~transformers.BertTokenizer` is used to encode the text. A custom detector/feature extractor must be used
-to get the visual embeddings. For an example on how to generate visual embeddings, see the `colab notebook
-<https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__. The following example shows
-how to get the last hidden state using :class:`~transformers.VisualBertModel`:
+to get the visual embeddings. The following example notebooks show how to use VisualBERT with Detectron-like models:
+
+* `VisualBERT VQA demo notebook
+  <https://github.com/huggingface/transformers/tree/master/examples/research_projects/visual_bert>`__ : This notebook
+  contains an example on VisualBERT VQA.
+
+* `Generate Embeddings for VisualBERT (Colab Notebook)
+  <https://colab.research.google.com/drive/1bLGxKdldwqnMVA5x4neY7-l_8fKGWQYI?usp=sharing>`__ : This notebook contains
+  an example on how to generate visual embeddings.
+
+The following example shows how to get the last hidden state using :class:`~transformers.VisualBertModel`:

 .. code-block::

@@ -74,6 +82,13 @@ how to get the last hidden state using :class:`~transformers.VisualBertModel`:
        >>> # this is a custom function that returns the visual embeddings given the image path
        >>> visual_embeds = get_visual_embeddings(image_path)

+        >>> visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
+        >>> visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
+        >>> inputs.update({
+        ...     "visual_embeds": visual_embeds,
+        ...     "visual_token_type_ids": visual_token_type_ids,
+        ...     "visual_attention_mask": visual_attention_mask
+        ... })
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state

--- a/docs/source/model_doc/vit.rst
+++ b/docs/source/model_doc/vit.rst
@@ -66,6 +66,23 @@ Tips:
  language modeling). With this approach, the smaller ViT-B/16 model achieves 79.9% accuracy on ImageNet, a significant
  improvement of 2% to training from scratch, but still 4% behind supervised pre-training.

+Several follow-up works have built on the original Vision Transformer:
+
+- DeiT (Data-efficient Image Transformers) by Facebook AI. DeiT models are distilled vision transformers. Refer to
+  :doc:`DeiT's documentation page <deit>`. The authors of DeiT also released more efficiently trained ViT models, which
+  you can directly plug into :class:`~transformers.ViTModel` or :class:`~transformers.ViTForImageClassification`. There
+  are 4 variants available (in 3 different sizes): `facebook/deit-tiny-patch16-224`, `facebook/deit-small-patch16-224`,
+  `facebook/deit-base-patch16-224` and `facebook/deit-base-patch16-384`. Note that one should use
+  :class:`~transformers.DeiTFeatureExtractor` in order to prepare images for the model.
+
+- BEiT (BERT pre-training of Image Transformers) by Microsoft Research. BEiT models outperform supervised pre-trained
+  vision transformers using a self-supervised method inspired by BERT (masked image modeling) and based on a VQ-VAE.
+  Refer to :doc:`BEiT's documentation page <beit>`.
+
+- DINO (a method for self-supervised training of Vision Transformers) by Facebook AI. Vision Transformers trained using
+  the DINO method show very interesting properties not seen with convolutional models. They are capable of segmenting
+  objects, without having ever been trained to do so. DINO checkpoints can be found on the `hub
+  <https://huggingface.co/models?other=dino>`__.

 This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code (written in JAX) can be
 found `here <https://github.com/google-research/vision_transformer>`__.
--- a/docs/source/model_doc/wav2vec2.rst
+++ b/docs/source/model_doc/wav2vec2.rst
@@ -67,6 +67,22 @@ Wav2Vec2Processor
    :members: __call__, pad, from_pretrained, save_pretrained, batch_decode, decode, as_target_processor


+Wav2Vec2 specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2BaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput
+    :members: 
+
+.. autoclass:: transformers.models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2BaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.wav2vec2.modeling_flax_wav2vec2.FlaxWav2Vec2ForPreTrainingOutput
+    :members: 
+
+
 Wav2Vec2Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -80,6 +96,14 @@ Wav2Vec2ForCTC
 .. autoclass:: transformers.Wav2Vec2ForCTC
    :members: forward

+
+Wav2Vec2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Wav2Vec2ForSequenceClassification
+    :members: forward
+
+
 Wav2Vec2ForPreTraining
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_sharing.rst
+++ b/docs/source/model_sharing.rst
@@ -76,7 +76,7 @@ Transformers:
 It will store your access token in the Hugging Face cache folder (by default :obj:`~/.cache/`).

 If you don't have an easy access to a terminal (for instance in a Colab session), you can find a token linked to your
-acount by going on `huggingface.co <https://huggingface.co/>`, click on your avatar on the top left corner, then on
+account by going on `huggingface.co <https://huggingface.co/>`__, click on your avatar on the top left corner, then on
 `Edit profile` on the left, just beneath your profile picture. In the submenu `API Tokens`, you will find your API
 token that you can just copy.

@@ -90,7 +90,7 @@ Directly push your model to the hub
   picture-in-picture" allowfullscreen></iframe>

 Once you have an API token (either stored in the cache or copied and pasted in your notebook), you can directly push a
-finetuned model you saved in :obj:`save_drectory` by calling:
+finetuned model you saved in :obj:`save_directory` by calling:

 .. code-block:: python

@@ -341,8 +341,8 @@ Add a model card

 To make sure everyone knows what your model can do, what its limitations, potential bias or ethical considerations are,
 please add a README.md model card to your model repo. You can just create it, or there's also a convenient button
-titled "Add a README.md" on your model page. A model card template can be found `here
-<https://github.com/huggingface/model_card>`__ (meta-suggestions are welcome). model card template (meta-suggestions
+titled "Add a README.md" on your model page. The model card documentation can be found `here
+<https://huggingface.co/docs/hub/model-repos>`__ (meta-suggestions
 are welcome).

 .. note::
--- a/docs/source/parallelism.md
+++ b/docs/source/parallelism.md
@@ -35,7 +35,7 @@ The following is the brief description of the main concepts that will be describ
 1. DataParallel (DP) - the same setup is replicated multiple times, and each being fed a slice of the data. The processing is done in parallel and all setups are synchronized at the end of each training step.
 2. TensorParallel (TP) - each tensor is split up into multiple chunks, so instead of having the whole tensor reside on a single gpu, each shard of the tensor resides on its designated gpu. During processing each shard gets processed separately and in parallel on different GPUs and the results are synced at the end of the step. This is what one may call horizontal parallelism, as the splitting happens on horizontal level.
 3. PipelineParallel (PP) - the model is split up vertically (layer-level) across multiple GPUs, so that only one or several layers of the model are places on a single gpu. Each gpu processes in parallel different stages of the pipeline and working on a small chunk of the batch.
-4. Zero Redundancy Optimizer (ZeRO) - Also performs sharding of the tensors somewhat similar to TP, except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model does't need to be modified. It also supports various offloading techniques to compensate for limited GPU memory.
+4. Zero Redundancy Optimizer (ZeRO) - Also performs sharding of the tensors somewhat similar to TP, except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model doesn't need to be modified. It also supports various offloading techniques to compensate for limited GPU memory.
 5. Sharded DDP - is another name for the foundational ZeRO concept as used by various other implementations of ZeRO.


@@ -58,7 +58,7 @@ a0 | b0 | c0
 a1 | b1 | c1
 a2 | b2 | c2
 ```
-Layer La has weights a0, at and a2.
+Layer La has weights a0, a1 and a2.

 If we have 3 GPUs, the Sharded DDP (= Zero-DP) splits the model onto 3 GPUs like so:

@@ -110,7 +110,7 @@ To me this sounds like an efficient group backpacking weight distribution strate
 2. person B carries the stove
 3. person C carries the axe

-Now each night they all share what they have with others and get from others what the don't have, and in the morning they pack up their allocated type of gear and continue on their way. This is Sharded DDP / Zero DP.
+Now each night they all share what they have with others and get from others what they don't have, and in the morning they pack up their allocated type of gear and continue on their way. This is Sharded DDP / Zero DP.

 Compare this strategy to the simple one where each person has to carry their own tent, stove and axe, which would be far more inefficient. This is DataParallel (DP and DDP) in Pytorch.

@@ -140,7 +140,7 @@ we just sliced it in 2 vertically, placing layers 0-3 onto GPU0 and 4-7 to GPU1.

 Now while data travels from layer 0 to 1, 1 to 2 and 2 to 3 this is just the normal model. But when data needs to pass from layer 3 to layer 4 it needs to travel from GPU0 to GPU1 which introduces a communication overhead. If the participating GPUs are on the same compute node (e.g. same physical machine) this copying is pretty fast, but if the GPUs are located on different compute nodes (e.g. multiple machines) the communication overhead could be significantly larger.

-Then layers 4 to 5 to 6 to 7 are as a normal model would have and when the 7th layer completes we often need to send the data back to layer 0 where the labels are (or alternatively send the labels to the the last layer). Now the loss can be computed and the optimizer can do its work.
+Then layers 4 to 5 to 6 to 7 are as a normal model would have and when the 7th layer completes we often need to send the data back to layer 0 where the labels are (or alternatively send the labels to the last layer). Now the loss can be computed and the optimizer can do its work.

 Problems:
 - the main deficiency and why this one is called "naive" MP, is that all but one GPU is idle at any given moment. So if 4 GPUs are used, it's almost identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware. Plus there is the overhead of copying the data between devices. So 4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, except the latter will complete the training faster, since it doesn't have the data copying overhead. But, say, if you have 40GB cards and need to fit a 45GB model you can with 4x 40GB cards (but barely because of the gradient and optimizer states)
@@ -220,9 +220,12 @@ Special considerations: TP requires very fast network, and therefore it's not ad
 This section is based on the original much more [detailed TP overview](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530).
 by [@anton-l](https://github.com/anton-l).

-Implementations:
+Alternative names:
 - DeepSpeed calls it [tensor slicing](https://www.deepspeed.ai/features/#model-parallelism)
- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation.
+
+Implementations:
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation, as it's very model-specific
+- [parallelformers](https://github.com/tunib-ai/parallelformers) (only inference at the moment)

 🤗 Transformers status:
 - core: not yet implemented in the core
@@ -269,7 +272,7 @@ Implementations:

 One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. It has already been discussed in [ZeRO Data Parallel](#zero-data-parallel). Normally it's a standalone feature that doesn't require PP or TP. But it can be combined with PP and TP.

-When ZeRO-DP is combined with PP (and optinally TP) it typically enables only ZeRO stage 1 (optimizer sharding).
+When ZeRO-DP is combined with PP (and optionally TP) it typically enables only ZeRO stage 1 (optimizer sharding).

 While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have bad performance impacts. There would need to be an additional reduce-scatter collective for every micro-batch to aggregate the gradients before sharding, which adds a potentially significant communication overhead. By nature of Pipeline Parallelism, small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with minimizing the Pipeline bubble (number of micro-batches). Therefore those communication costs are going to hurt.

@@ -293,12 +296,27 @@ Paper: ["Beyond Data and Model Parallelism for Deep Neural Networks" by Zhihao J

 It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter.

-1. Sample = Data Parallelism
-2. Operator = part vertical Layer Parallelism, but it can split the layer too - more refined level
-3. Attribute = horizontal Model Parallelism (Megatron-LM style)
-4. Parameter = Sharded model params
+1. Sample = Data Parallelism (sample-wise parallel)
+2. Operator = Parallelize a single operation into several sub-operations
+3. Attribute = Data Parallelism (length-wise parallel)
+4. Parameter = Model Parallelism (regardless of dimension - horizontal or vertical)

-and they are working on Pipeline Parallelism. I guess ZeRO-DP is Sample+Parameter in this context.
+Examples:
+* Sample
+
+Let's take 10 batches of sequence length 512. If we parallelize them by sample dimension into 2 devices, we get 10 x 512 which becomes 5 x 2 x 512.
+
+* Operator
+
+If we perform layer normalization, we compute std first and mean second, and then we can normalize data. Operator parallelism allows computing std and mean in parallel. So if we parallelize them by operator dimension into 2 devices (cuda:0, cuda:1), first we copy input data into both devices, and cuda:0 computes std, cuda:1 computes mean at the same time.
+
+* Attribute
+
+We have 10 batches of 512 length. If we parallelize them by attribute dimension into 2 devices, 10 x 512 will be 10 x 2 x 256.
+
+* Parameter
+
+It is similar to tensor model parallelism or naive layer-wise model parallelism.

 ![flex-flow-soap](imgs/parallelism-flexflow.jpeg)

@@ -313,7 +331,7 @@ So the promise is very attractive - it runs a 30min simulation on the cluster of

 ## Which Strategy To Use When

-Here is a very rough outlook at which parallelism strategy to use when. The first on the list is typically faster.
+Here is a very rough outline of which parallelism strategy to use when. The first on each list is typically faster.

 **⇨ Single GPU**

@@ -324,7 +342,11 @@ Here is a very rough outlook at which parallelism strategy to use when. The firs
 * Model doesn't fit onto a single GPU:

    1. ZeRO + Offload CPU and optionally NVMe
+    2. as above plus Memory Centric Tiling (see below for details) if the largest layer can't fit into a single GPU

+* Largest Layer not fitting into a single GPU:
+
+    1. ZeRO - Enable [Memory Centric Tiling](https://deepspeed.readthedocs.io/en/latest/zero3.html#memory-centric-tiling) (MCT). It allows you to run arbitrarily large layers by automatically splitting them and executing them sequentially. MCT reduces the number of parameters that are live on a GPU, but it does not affect the activation memory. As this need is very rare, as of this writing a manual override of `torch.nn.Linear` needs to be done by the user.

 **⇨ Single Node / Multi-GPU**

@@ -339,7 +361,14 @@ Here is a very rough outlook at which parallelism strategy to use when. The firs
    2. ZeRO
    3. TP

-    With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP and ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup.
+    With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP or ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup.
+
+    TP is almost always used within a single node. That is, TP size <= GPUs per node.
+
+* Largest Layer not fitting into a single GPU:
+
+    1. If not using ZeRO - must use TP, as PP alone won't be able to fit.
+    2. With ZeRO see the same entry for "Single GPU" above


 **⇨ Multi-Node / Multi-GPU**
--- a/docs/source/performance.md
+++ b/docs/source/performance.md
@@ -53,6 +53,7 @@ Software:
 - Tensor Parallelism
 - Low-memory Optimizers
 - fp16/bf16 (smaller data)
+- Gradient checkpointing



@@ -226,6 +227,21 @@ pytorch `autocast` which performs AMP include a caching feature, which speed thi

 Autocast maintains a cache of the FP16 casts of model params (leaves). This helps streamline parameter reuse: if the same FP32 param is used in several different FP16list ops, like several matmuls, instead of re-casting the param to FP16 on entering each matmul, the cast will occur on the first matmul, the casted FP16 copy will be cached, and for all later matmuls the FP16 copy will be reused. The cache is maintained only within a particular outermost autocast context. When you exit the autocast context the cache is dropped. For recommended usage, in which autocast wraps the forward pass, and then you exit the context before calling backward(), this means the cache only lasts the duration of the forward pass each iteration, and will be rebuilt next iteration. (The cache of FP16-casted copies MUST be rebuilt each iteration. The FP32 params get updated by the optimizer, so the FP16 copies must be recreated, otherwise the FP16 values will be stale.)

+
+### Gradient Checkpointing
+
+One way to use significantly less GPU memory is to enable "Gradient Checkpointing" (also known as "activation checkpointing"). When enabled, a lot of memory can be freed at the cost of a small decrease in the training speed due to recomputing parts of the graph during back-propagation.
+
+This technique was first shared in the paper: [Training Deep Nets with Sublinear Memory Cost](https://arxiv.org/abs/1604.06174). The paper will also give you the exact details on the savings, but it's in the ballpark of `O(sqrt(n))`, where `n` is the number of feed-forward layers.
+
+To activate this feature in 🤗 Transformers for models that support it, use:
+
+```python
+model.gradient_checkpointing_enable()
+```
+or add `--gradient_checkpointing` to the Trainer arguments.
+
+
 ### Batch sizes

 One gets the most efficient performance when batch sizes and input/output neuron counts are divisible by a certain number, which typically starts at 8, but can be much higher as well. That number varies a lot depending on the specific hardware being used and the dtype of the model.
--- a/docs/source/perplexity.rst
+++ b/docs/source/perplexity.rst
@@ -96,11 +96,11 @@ dataset in memory.

 .. code-block:: python

-    from nlp import load_dataset
+    from datasets import load_dataset
    test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
    encodings = tokenizer('\n\n'.join(test['text']), return_tensors='pt')

-With 🤗 Transformers, we can simply pass the ``input_ids`` as the ``labels`` to our model, and the average
+With 🤗 Transformers, we can simply pass the ``input_ids`` as the ``labels`` to our model, and the average negative
 log-likelihood for each token is returned as the loss. With our sliding window approach, however, there is overlap in
 the tokens we pass to the model at each iteration. We don't want the log-likelihood for the tokens we're just treating
 as context to be included in our loss, so we can set these targets to ``-100`` so that they are ignored. The following
@@ -110,10 +110,13 @@ available to condition on).

 .. code-block:: python

+    import torch
+    from tqdm import tqdm
+
    max_length = model.config.n_positions
    stride = 512

-    lls = []
+    nlls = []
    for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
        begin_loc = max(i + stride - max_length, 0)
        end_loc = min(i + stride, encodings.input_ids.size(1))
@@ -124,11 +127,11 @@ available to condition on).

        with torch.no_grad():
            outputs = model(input_ids, labels=target_ids)
-            log_likelihood = outputs[0] * trg_len
+            neg_log_likelihood = outputs[0] * trg_len

-        lls.append(log_likelihood)
+        nlls.append(neg_log_likelihood)

-    ppl = torch.exp(torch.stack(lls).sum() / end_loc)
+    ppl = torch.exp(torch.stack(nlls).sum() / end_loc)

 Running this with the stride length equal to the max input length is equivalent to the suboptimal, non-sliding-window
 strategy we discussed above. The smaller the stride, the more context the model will have in making each prediction,
--- a/docs/source/preprocessing.rst
+++ b/docs/source/preprocessing.rst
@@ -228,7 +228,7 @@ Everything you always wanted to know about padding and truncation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 We have seen the commands that will work for most cases (pad your batch to the length of the maximum sentence and
-truncate to the maximum length the mode can accept). However, the API supports more strategies if you need them. The
+truncate to the maximum length the model can accept). However, the API supports more strategies if you need them. The
 three arguments you need to know for this are :obj:`padding`, :obj:`truncation` and :obj:`max_length`.

 - :obj:`padding` controls the padding. It can be a boolean or a string which should be:
@@ -243,15 +243,16 @@ three arguments you need to know for this are :obj:`padding`, :obj:`truncation`

 - :obj:`truncation` controls the truncation. It can be a boolean or a string which should be:

-    - :obj:`True` or :obj:`'only_first'` truncate to a maximum length specified by the :obj:`max_length` argument or
+    - :obj:`True` or :obj:`'longest_first'` truncate to a maximum length specified by the :obj:`max_length` argument or
      the maximum length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will
-      only truncate the first sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.
+      truncate token by token, removing a token from the longest sequence in the pair until the proper length is
+      reached.
    - :obj:`'only_second'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will only truncate
      the second sentence of a pair if a pair of sequence (or a batch of pairs of sequences) is provided.
-    - :obj:`'longest_first'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
-      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will truncate token
-      by token, removing a token from the longest sequence in the pair until the proper length is reached.
+    - :obj:`'only_first'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
+      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will only truncate
+      the first sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided.
    - :obj:`False` or :obj:`'do_not_truncate'` to not truncate the sequences. As we have seen before, this is the
      default behavior.

--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -202,7 +202,7 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
 |                    | ``distilroberta-base``                                     | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
 |                    |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
 |                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation>`__)                   |
 |                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                    | ``roberta-base-openai-detector``                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
 |                    |                                                            | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                             |
@@ -217,37 +217,37 @@ For the full list, refer to `https://huggingface.co/models <https://huggingface.
 | DistilBERT         | ``distilbert-base-uncased``                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                    |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint                                                   |
 |                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation>`__)                   |
 |                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                    | ``distilbert-base-uncased-distilled-squad``                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                    |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                 |
 |                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation>`__)                   |
 |                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                    | ``distilbert-base-cased``                                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
 |                    |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-cased` checkpoint                                                     |
 |                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation>`__)                   |
 |                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                    | ``distilbert-base-cased-distilled-squad``                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
 |                    |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-cased` checkpoint, with an additional question answering layer.       |
 |                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation>`__)                   |
 |                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                    | ``distilgpt2``                                             | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
 |                    |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
 |                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation>`__)                   |
 |                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                    | ``distilbert-base-german-cased``                           | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                    |                                                            | | The German DistilBERT model distilled from the German DBMDZ BERT model `bert-base-german-dbmdz-cased` checkpoint.                   |
 |                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation>`__)                   |
 |                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                    | ``distilbert-base-multilingual-cased``                     | | 6-layer, 768-hidden, 12-heads, 134M parameters                                                                                      |
 |                    |                                                            | | The multilingual DistilBERT model distilled from the Multilingual BERT model `bert-base-multilingual-cased` checkpoint.             |
 |                    |                                                            |                                                                                                                                       |
-|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/research_projects/distillation>`__)                   |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | CTRL               | ``ctrl``                                                   | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters                                                                                    |
 |                    |                                                            | | Salesforce's Large-sized CTRL English model                                                                                         |
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -65,10 +65,10 @@ make them readable. For instance:
 .. code-block::

    >>> classifier('We are very happy to show you the 🤗 Transformers library.')
-    [{'label': 'POSITIVE', 'score': 0.9997795224189758}]
+    [{'label': 'POSITIVE', 'score': 0.9998}]

-That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model as a
-`batch`, returning a list of dictionaries like this one:
+That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model, returning
+a list of dictionaries like this one:

 .. code-block::

@@ -79,6 +79,8 @@ That's encouraging! You can use it on a list of sentences, which will be preproc
    label: POSITIVE, with score: 0.9998
    label: NEGATIVE, with score: 0.5309

+To use a pipeline with a large dataset, see :doc:`iterating over a pipeline <./main_classes/pipelines>`.
+
 You can see the second sentence has been classified as negative (it needs to be positive or negative) but its score is
 fairly neutral.

@@ -195,7 +197,8 @@ sequence:
 .. code-block::

    >>> print(inputs)
-    {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+    {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102],
+     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

 You can pass a list of sentences directly to your tokenizer. If your goal is to send them through your model as a
 batch, you probably want to pad them all to the same length, truncate them to the maximum length the model can accept
@@ -260,12 +263,12 @@ objects are described in greater detail :doc:`here <main_classes/output>`. For n
    >>> ## PYTORCH CODE
    >>> print(pt_outputs)
    SequenceClassifierOutput(loss=None, logits=tensor([[-4.0833,  4.3364],
-        [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
+            [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
    >>> ## TENSORFLOW CODE
    >>> print(tf_outputs)
    TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
-    array([[-4.0832963 ,  4.3364143 ],
-           [ 0.081807  , -0.04178282]], dtype=float32)>, hidden_states=None, attentions=None)
+    array([[-4.0833 ,  4.3364  ],
+           [ 0.0818, -0.0418]], dtype=float32)>, hidden_states=None, attentions=None)

 Notice how the output object has a ``logits`` attribute. You can use this to access the model's final activations.

@@ -283,7 +286,7 @@ Let's apply the SoftMax activation to get predictions.
    >>> pt_predictions = nn.functional.softmax(pt_outputs.logits, dim=-1)
    >>> ## TENSORFLOW CODE
    >>> import tensorflow as tf
-    >>> tf.nn.softmax(tf_outputs.logits, axis=-1)
+    >>> tf_predictions = tf.nn.softmax(tf_outputs.logits, axis=-1)

 We can see we get the numbers from before:

@@ -292,8 +295,8 @@ We can see we get the numbers from before:
    >>> ## TENSORFLOW CODE
    >>> print(tf_predictions)
    tf.Tensor(
-    [[2.2042994e-04 9.9977952e-01]
-     [5.3086340e-01 4.6913657e-01]], shape=(2, 2), dtype=float32)
+    [[2.2043e-04 9.9978e-01]
+     [5.3086e-01 4.6914e-01]], shape=(2, 2), dtype=float32)
    >>> ## PYTORCH CODE
    >>> print(pt_predictions)
    tensor([[2.2043e-04, 9.9978e-01],
@@ -309,14 +312,14 @@ attribute:
    >>> pt_outputs = pt_model(**pt_batch, labels = torch.tensor([1, 0]))
    >>> print(pt_outputs)
    SequenceClassifierOutput(loss=tensor(0.3167, grad_fn=<NllLossBackward>), logits=tensor([[-4.0833,  4.3364],
-    [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
+            [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>), hidden_states=None, attentions=None)
    >>> ## TENSORFLOW CODE
    >>> import tensorflow as tf
    >>> tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))
    >>> print(tf_outputs)
-    TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2051287e-04, 6.3326043e-01], dtype=float32)>, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
-    array([[-4.0832963 ,  4.3364143 ],
-           [ 0.081807  , -0.04178282]], dtype=float32)>, hidden_states=None, attentions=None)
+    TFSequenceClassifierOutput(loss=<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2.2051e-04, 6.3326e-01], dtype=float32)>, logits=<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
+    array([[-4.0833 ,  4.3364  ],
+           [ 0.0818, -0.0418]], dtype=float32)>, hidden_states=None, attentions=None)

 Models are standard `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ or `tf.keras.Model
 <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ so you can use them in your usual training loop. 🤗
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -37,11 +37,19 @@ architectures, and are made to be easily extendable to other architectures.

 Ready-made configurations include the following models:

+..
+    This table is automatically generated by make style, do not fill manually!
+
 - ALBERT
 - BART
 - BERT
+- CamemBERT
 - DistilBERT
- GPT-2
+- GPT Neo
+- LayoutLM
+- Longformer
+- mBART
+- OpenAI GPT-2
 - RoBERTa
 - T5
 - XLM-RoBERTa
@@ -99,6 +107,30 @@ It will be exported under ``onnx/bert-base-cased``. You should see similar logs:
                    -[✓] all values close (atol: 0.0001)
    All good, model saved at: onnx/bert-base-cased/model.onnx

+This export can now be used in the ONNX inference runtime:
+
+.. code-block::
+
+    import onnxruntime as ort
+
+    from transformers import BertTokenizerFast
+    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
+
+    ort_session = ort.InferenceSession("onnx/bert-base-cased/model.onnx")
+
+    inputs = tokenizer("Using BERT in ONNX!", return_tensors="np")
+    outputs = ort_session.run(["last_hidden_state", "pooler_output"], dict(inputs))
+
+The outputs used (:obj:`["last_hidden_state", "pooler_output"]`) can be obtained by taking a look at the ONNX
+configuration of each model. For example, for BERT:
+
+.. code-block::
+
+    from transformers.models.bert import BertOnnxConfig, BertConfig
+
+    config = BertConfig()
+    onnx_config = BertOnnxConfig(config)
+    output_keys = list(onnx_config.outputs.keys())

 Implementing a custom configuration for an unsupported architecture
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -142,6 +174,12 @@ An important fact to notice is the use of `OrderedDict` in both inputs and outpu
 as inputs are matched against their relative position within the `PreTrainedModel.forward()` prototype and outputs are
 match against there position in the returned `BaseModelOutputX` instance.

+An example of such an addition is visible here, for the MBart model: `Making MBART ONNX-convertible
+<https://github.com/huggingface/transformers/pull/13049/commits/d097adcebd89a520f04352eb215a85916934204f>`__.
+
+If you would like to contribute your addition to the library, we recommend you implement tests. An example of such
+tests is visible here: `Adding tests to the MBART ONNX conversion
+<https://github.com/huggingface/transformers/pull/13049/commits/5d642f65abf45ceeb72bd855ca7bfe2506a58e6a>`__.

 Graph conversion
 -----------------------------------------------------------------------------------------------------------------------
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -107,7 +107,8 @@ each other. The process is the following:
    >>> sequence_1 = "Apples are especially bad for your health"
    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

-    >>> # The tokekenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to the sequence, as well as compute the attention masks.
+    >>> # The tokenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to
+    >>> # the sequence, as well as compute the attention masks.
    >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
    >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")

@@ -141,12 +142,13 @@ each other. The process is the following:
    >>> sequence_1 = "Apples are especially bad for your health"
    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"

-    >>> # The tokekenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to the sequence, as well as compute the attention masks.
+    >>> # The tokenizer will automatically add any model specific separators (i.e. <CLS> and <SEP>) and tokens to
+    >>> # the sequence, as well as compute the attention masks.
    >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf")
    >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf")

-    >>> paraphrase_classification_logits = model(paraphrase)[0]
-    >>> not_paraphrase_classification_logits = model(not_paraphrase)[0]
+    >>> paraphrase_classification_logits = model(paraphrase).logits
+    >>> not_paraphrase_classification_logits = model(not_paraphrase).logits

    >>> paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
    >>> not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
@@ -197,11 +199,11 @@ positions of the extracted answer in the text.

    >>> result = question_answerer(question="What is extractive question answering?", context=context)
    >>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
-    Answer: 'the task of extracting an answer from a text given a question.', score: 0.6226, start: 34, end: 96
+    Answer: 'the task of extracting an answer from a text given a question', score: 0.6177, start: 34, end: 95

    >>> result = question_answerer(question="What is a good example of a question answering dataset?", context=context)
    >>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
-    Answer: 'SQuAD dataset,', score: 0.5053, start: 147, end: 161
+    Answer: 'SQuAD dataset', score: 0.5152, start: 147, end: 160


 Here is an example of question answering using a model and a tokenizer. The process is the following:
@@ -247,10 +249,10 @@ Here is an example of question answering using a model and a tokenizer. The proc
    ...     answer_start_scores = outputs.start_logits
    ...     answer_end_scores = outputs.end_logits
    ...
-    ...     answer_start = torch.argmax(
-    ...         answer_start_scores
-    ...     )  # Get the most likely beginning of answer with the argmax of the score
-    ...     answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
+    ...     # Get the most likely beginning of answer with the argmax of the score
+    ...     answer_start = torch.argmax(answer_start_scores)
+    ...     # Get the most likely end of answer with the argmax of the score
+    ...     answer_end = torch.argmax(answer_end_scores) + 1
    ...
    ...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    ...
@@ -261,7 +263,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
    Question: What does 🤗 Transformers provide?
    Answer: general - purpose architectures
    Question: 🤗 Transformers provides interoperability between which frameworks?
-    Answer: tensorflow 2 . 0 and pytorch
+    Answer: tensorflow 2. 0 and pytorch
    >>> ## TENSORFLOW CODE
    >>> from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
    >>> import tensorflow as tf
@@ -290,12 +292,11 @@ Here is an example of question answering using a model and a tokenizer. The proc
    ...     answer_start_scores = outputs.start_logits
    ...     answer_end_scores = outputs.end_logits
    ...
-    ...     answer_start = tf.argmax(
-    ...         answer_start_scores, axis=1
-    ...     ).numpy()[0]  # Get the most likely beginning of answer with the argmax of the score
-    ...     answer_end = (
-    ...         tf.argmax(answer_end_scores, axis=1) + 1
-    ...     ).numpy()[0]  # Get the most likely end of answer with the argmax of the score
+    ...     # Get the most likely beginning of answer with the argmax of the score
+    ...     answer_start = tf.argmax(answer_start_scores, axis=1).numpy()[0]
+    ...     # Get the most likely end of answer with the argmax of the score
+    ...     answer_end = tf.argmax(answer_end_scores, axis=1).numpy()[0] + 1
+    ...
    ...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
    ...
    ...     print(f"Question: {question}")
@@ -305,7 +306,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
    Question: What does 🤗 Transformers provide?
    Answer: general - purpose architectures
    Question: 🤗 Transformers provides interoperability between which frameworks?
-    Answer: tensorflow 2 . 0 and pytorch
+    Answer: tensorflow 2. 0 and pytorch



@@ -344,31 +345,31 @@ This outputs the sequences with the mask filled, the confidence score, and the t

    >>> from pprint import pprint
    >>> pprint(unmasker(f"HuggingFace is creating a {unmasker.tokenizer.mask_token} that the community uses to solve NLP tasks."))
-    [{'score': 0.1792745739221573,
-      'sequence': '<s>HuggingFace is creating a tool that the community uses to '
-                  'solve NLP tasks.</s>',
+    [{'score': 0.1793,
+      'sequence': 'HuggingFace is creating a tool that the community uses to solve '
+                  'NLP tasks.',
      'token': 3944,
-      'token_str': 'Ġtool'},
-     {'score': 0.11349421739578247,
-      'sequence': '<s>HuggingFace is creating a framework that the community uses '
-                  'to solve NLP tasks.</s>',
+      'token_str': ' tool'},
+     {'score': 0.1135,
+      'sequence': 'HuggingFace is creating a framework that the community uses to '
+                  'solve NLP tasks.',
      'token': 7208,
-      'token_str': 'Ġframework'},
-     {'score': 0.05243554711341858,
-      'sequence': '<s>HuggingFace is creating a library that the community uses to '
-                  'solve NLP tasks.</s>',
+      'token_str': ' framework'},
+     {'score': 0.0524,
+      'sequence': 'HuggingFace is creating a library that the community uses to '
+                  'solve NLP tasks.',
      'token': 5560,
-      'token_str': 'Ġlibrary'},
-     {'score': 0.03493533283472061,
-      'sequence': '<s>HuggingFace is creating a database that the community uses '
-                  'to solve NLP tasks.</s>',
+      'token_str': ' library'},
+     {'score': 0.0349,
+      'sequence': 'HuggingFace is creating a database that the community uses to '
+                  'solve NLP tasks.',
      'token': 8503,
-      'token_str': 'Ġdatabase'},
-     {'score': 0.02860250137746334,
-      'sequence': '<s>HuggingFace is creating a prototype that the community uses '
-                  'to solve NLP tasks.</s>',
+      'token_str': ' database'},
+     {'score': 0.0286,
+      'sequence': 'HuggingFace is creating a prototype that the community uses to '
+                  'solve NLP tasks.',
      'token': 17715,
-      'token_str': 'Ġprototype'}]
+      'token_str': ' prototype'}]

 Here is an example of doing masked language modeling using a model and a tokenizer. The process is the following:

@@ -385,42 +386,22 @@ Here is an example of doing masked language modeling using a model and a tokeniz
 .. code-block::

    >>> ## PYTORCH CODE
-    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> from transformers import AutoModelForMaskedLM, AutoTokenizer
    >>> import torch

    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    >>> model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+    >>> model = AutoModelForMaskedLM.from_pretrained("distilbert-base-cased")

-    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+    >>> sequence = "Distilled models are smaller than the models they mimic. Using them instead of the large " \
+    ...     f"versions would help {tokenizer.mask_token} our carbon footprint."

-    >>> input = tokenizer.encode(sequence, return_tensors="pt")
-    >>> mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
+    >>> inputs = tokenizer(sequence, return_tensors="pt")
+    >>> mask_token_index = torch.where(inputs["input_ids"] == tokenizer.mask_token_id)[1]

-    >>> token_logits = model(input).logits
+    >>> token_logits = model(**inputs).logits
    >>> mask_token_logits = token_logits[0, mask_token_index, :]

    >>> top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
-    >>> ## TENSORFLOW CODE
-    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
-    >>> import tensorflow as tf
-
-    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    >>> model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
-
-    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
-
-    >>> input = tokenizer.encode(sequence, return_tensors="tf")
-    >>> mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
-
-    >>> token_logits = model(input)[0]
-    >>> mask_token_logits = token_logits[0, mask_token_index, :]
-
-    >>> top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
-
-
-This prints five sequences, with the top 5 tokens predicted by the model:
-
-.. code-block::

    >>> for token in top_5_tokens:
    ...     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
@@ -429,6 +410,34 @@ This prints five sequences, with the top 5 tokens predicted by the model:
    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelForMaskedLM, AutoTokenizer
+    >>> import tensorflow as tf
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    >>> model = TFAutoModelForMaskedLM.from_pretrained("distilbert-base-cased")
+
+    >>> sequence = "Distilled models are smaller than the models they mimic. Using them instead of the large " \
+    ...     f"versions would help {tokenizer.mask_token} our carbon footprint."
+
+    >>> inputs = tokenizer(sequence, return_tensors="tf")
+    >>> mask_token_index = tf.where(inputs["input_ids"] == tokenizer.mask_token_id)[0, 1]
+
+    >>> token_logits = model(**inputs).logits
+    >>> mask_token_logits = token_logits[0, mask_token_index, :]
+
+    >>> top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
+
+    >>> for token in top_5_tokens:
+    ...     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
+
+
+This prints five sequences, with the top 5 tokens predicted by the model.


 Causal Language Modeling
@@ -449,19 +458,20 @@ of tokens.
 .. code-block::

    >>> ## PYTORCH CODE
-    >>> from transformers import AutoModelWithLMHead, AutoTokenizer, top_k_top_p_filtering
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer, top_k_top_p_filtering
    >>> import torch
    >>> from torch import nn

    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    >>> model = AutoModelWithLMHead.from_pretrained("gpt2")
+    >>> model = AutoModelForCausalLM.from_pretrained("gpt2")

    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and"

-    >>> input_ids = tokenizer.encode(sequence, return_tensors="pt")
+    >>> inputs = tokenizer(sequence, return_tensors="pt")
+    >>> input_ids = inputs["input_ids"]

    >>> # get logits of last hidden state
-    >>> next_token_logits = model(input_ids).logits[:, -1, :]
+    >>> next_token_logits = model(**inputs).logits[:, -1, :]

    >>> # filter
    >>> filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
@@ -473,19 +483,22 @@ of tokens.
    >>> generated = torch.cat([input_ids, next_token], dim=-1)

    >>> resulting_string = tokenizer.decode(generated.tolist()[0])
+    >>> print(resulting_string)
+    Hugging Face is based in DUMBO, New York City, and ...
    >>> ## TENSORFLOW CODE
-    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer, tf_top_k_top_p_filtering
+    >>> from transformers import TFAutoModelForCausalLM, AutoTokenizer, tf_top_k_top_p_filtering
    >>> import tensorflow as tf

    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    >>> model = TFAutoModelWithLMHead.from_pretrained("gpt2")
+    >>> model = TFAutoModelForCausalLM.from_pretrained("gpt2")

-    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "
+    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and"

-    >>> input_ids = tokenizer.encode(sequence, return_tensors="tf")
+    >>> inputs = tokenizer(sequence, return_tensors="tf")
+    >>> input_ids = inputs["input_ids"]

    >>> # get logits of last hidden state
-    >>> next_token_logits = model(input_ids)[0][:, -1, :]
+    >>> next_token_logits = model(**inputs).logits[:, -1, :]

    >>> # filter
    >>> filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
@@ -496,14 +509,11 @@ of tokens.
    >>> generated = tf.concat([input_ids, next_token], axis=1)

    >>> resulting_string = tokenizer.decode(generated.numpy().tolist()[0])
-
-
-This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *has*:
-
-.. code-block::
-
    >>> print(resulting_string)
-    Hugging Face is based in DUMBO, New York City, and has
+    Hugging Face is based in DUMBO, New York City, and ...
+
+This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *is* or
+*features*.

 In the next section, we show how :func:`~transformers.generation_utils.GenerationMixin.generate` can be used to
 generate multiple tokens up to a specified length instead of one token at a time.
@@ -522,7 +532,8 @@ As a default all models apply *Top-K* sampling when used in pipelines, as config

    >>> text_generator = pipeline("text-generation")
    >>> print(text_generator("As far as I am concerned, I will", max_length=50, do_sample=False))
-    [{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a "free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}]
+    [{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a
+    "free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}]



@@ -536,9 +547,9 @@ Below is an example of text generation using ``XLNet`` and its tokenizer, which
 .. code-block::

    >>> ## PYTORCH CODE
-    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer

-    >>> model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    >>> model = AutoModelForCausalLM.from_pretrained("xlnet-base-cased")
    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
@@ -554,41 +565,42 @@ Below is an example of text generation using ``XLNet`` and its tokenizer, which
    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""

    >>> prompt = "Today the weather is really nice and I am planning on "
-    >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
+    >>> inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")["input_ids"]

-    >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    >>> prompt_length = len(tokenizer.decode(inputs[0]))
    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
-    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
-
-    >>> ## TENSORFLOW CODE
-    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
-
-    >>> model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
-    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
-
-    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
-    >>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
-    ... (except for Alexei and Maria) are discovered.
-    ... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
-    ... remainder of the story. 1883 Western Siberia,
-    ... a young Grigori Rasputin is asked by his father and a group of men to perform magic.
-    ... Rasputin has a vision and denounces one of the men as a horse thief. Although his
-    ... father initially slaps him for making such an accusation, Rasputin watches as the
-    ... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
-    ... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
-    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
-
-    >>> prompt = "Today the weather is really nice and I am planning on "
-    >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
-
-    >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
-    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
-    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
-
-.. code-block::
+    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]

    >>> print(generated)
-    Today the weather is really nice and I am planning on anning on taking a nice...... of a great time!<eop>...............
+    Today the weather is really nice and I am planning ...
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelForCausalLM, AutoTokenizer
+
+    >>> model = TFAutoModelForCausalLM.from_pretrained("xlnet-base-cased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
+
+    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    >>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    ... (except for Alexei and Maria) are discovered.
+    ... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    ... remainder of the story. 1883 Western Siberia,
+    ... a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    ... Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    ... father initially slaps him for making such an accusation, Rasputin watches as the
+    ... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    ... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
+
+    >>> prompt = "Today the weather is really nice and I am planning on "
+    >>> inputs = tokenizer(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")["input_ids"]
+
+    >>> prompt_length = len(tokenizer.decode(inputs[0]))
+    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length+1:]
+
+    >>> print(generated)
+    Today the weather is really nice and I am planning ...
+

 Text generation is currently possible with *GPT-2*, *OpenAi-GPT*, *CTRL*, *XLNet*, *Transfo-XL* and *Reformer* in
 PyTorch and for most models in Tensorflow as well. As can be seen in the example above *XLNet* and *Transfo-XL* often
@@ -638,21 +650,20 @@ Here are the expected results:

 .. code-block::

-    >>> print(ner_pipe(sequence))
-    [
-        {'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
-        {'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
-        {'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
-        {'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
-        {'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
-        {'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
-        {'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
-        {'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
-        {'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
-        {'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
-        {'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
-        {'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
-    ]
+    >>> for entity in ner_pipe(sequence):
+    ...     print(entity)
+    {'entity': 'I-ORG', 'score': 0.9996, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2}
+    {'entity': 'I-ORG', 'score': 0.9910, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7}
+    {'entity': 'I-ORG', 'score': 0.9982, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12}
+    {'entity': 'I-ORG', 'score': 0.9995, 'index': 4, 'word': 'Inc', 'start': 13, 'end': 16}
+    {'entity': 'I-LOC', 'score': 0.9994, 'index': 11, 'word': 'New', 'start': 40, 'end': 43}
+    {'entity': 'I-LOC', 'score': 0.9993, 'index': 12, 'word': 'York', 'start': 44, 'end': 48}
+    {'entity': 'I-LOC', 'score': 0.9994, 'index': 13, 'word': 'City', 'start': 49, 'end': 53}
+    {'entity': 'I-LOC', 'score': 0.9863, 'index': 19, 'word': 'D', 'start': 79, 'end': 80}
+    {'entity': 'I-LOC', 'score': 0.9514, 'index': 20, 'word': '##UM', 'start': 80, 'end': 82}
+    {'entity': 'I-LOC', 'score': 0.9337, 'index': 21, 'word': '##BO', 'start': 82, 'end': 84}
+    {'entity': 'I-LOC', 'score': 0.9762, 'index': 28, 'word': 'Manhattan', 'start': 114, 'end': 123}
+    {'entity': 'I-LOC', 'score': 0.9915, 'index': 29, 'word': 'Bridge', 'start': 124, 'end': 130}

 Note how the tokens of the sequence "Hugging Face" have been identified as an organisation, and "New York City",
 "DUMBO" and "Manhattan Bridge" have been identified as locations.
@@ -679,26 +690,13 @@ Here is an example of doing named entity recognition, using a model and a tokeni
    >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-    >>> label_list = [
-    ...     "O",       # Outside of a named entity
-    ...     "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
-    ...     "I-MISC",  # Miscellaneous entity
-    ...     "B-PER",   # Beginning of a person's name right after another person's name
-    ...     "I-PER",   # Person's name
-    ...     "B-ORG",   # Beginning of an organisation right after another organisation
-    ...     "I-ORG",   # Organisation
-    ...     "B-LOC",   # Beginning of a location right after another location
-    ...     "I-LOC"    # Location
-    ... ]
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " \
+    ...            "therefore very close to the Manhattan Bridge."

-    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-    ...            "close to the Manhattan Bridge."
+    >>> inputs = tokenizer(sequence, return_tensors="pt")
+    >>> tokens = inputs.tokens()

-    >>> # Bit of a hack to get the tokens with the special tokens
-    >>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
-    >>> inputs = tokenizer.encode(sequence, return_tensors="pt")
-
-    >>> outputs = model(inputs).logits
+    >>> outputs = model(**inputs).logits
    >>> predictions = torch.argmax(outputs, dim=2)
    >>> ## TENSORFLOW CODE
    >>> from transformers import TFAutoModelForTokenClassification, AutoTokenizer
@@ -707,14 +705,13 @@ Here is an example of doing named entity recognition, using a model and a tokeni
    >>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

-    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-    ...            "close to the Manhattan Bridge."
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, " \
+    ...            "therefore very close to the Manhattan Bridge."

-    >>> # Bit of a hack to get the tokens with the special tokens
-    >>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
-    >>> inputs = tokenizer.encode(sequence, return_tensors="tf")
+    >>> inputs = tokenizer(sequence, return_tensors="tf")
+    >>> tokens = inputs.tokens()

-    >>> outputs = model(inputs)[0]
+    >>> outputs = model(**inputs)[0]
    >>> predictions = tf.argmax(outputs, axis=2)


@@ -755,8 +752,7 @@ illustrated below:
    (',', 'O')
    ('therefore', 'O')
    ('very', 'O')
-    ('##c', 'O')
-    ('##lose', 'O')
+    ('close', 'O')
    ('to', 'O')
    ('the', 'O')
    ('Manhattan', 'I-LOC')
@@ -764,6 +760,7 @@ illustrated below:
    ('.', 'O')
    ('[SEP]', 'O')

+
 Summarization
 -----------------------------------------------------------------------------------------------------------------------

@@ -811,7 +808,9 @@ below. This outputs the following summary:
 .. code-block::

    >>> print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
-    [{'summary_text': 'Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and 2002. She is believed to still be married to four men.'}]
+    [{'summary_text': ' Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in
+    the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and
+    2002 . At one time, she was married to eight men at once, prosecutors say .'}]

 Here is an example of doing summarization using a model and a tokenizer. The process is the following:

@@ -833,8 +832,15 @@ CNN / Daily Mail), it yields very good results.
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
-    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512, truncation=True)
-    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+    >>> inputs = tokenizer("summarize: " + ARTICLE, return_tensors="pt", max_length=512, truncation=True)
+    >>> outputs = model.generate(
+    ...     inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
+    ... )
+
+    >>> print(tokenizer.decode(outputs[0]))
+    <pad> prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal
+    counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them
+    between 1999 and 2002.</s>
    >>> ## TENSORFLOW CODE
    >>> from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer

@@ -842,13 +848,15 @@ CNN / Daily Mail), it yields very good results.
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
-    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
-    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
-
-.. code-block::
+    >>> inputs = tokenizer("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
+    >>> outputs = model.generate(
+    ...     inputs["input_ids"], max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True
+    ... )

    >>> print(tokenizer.decode(outputs[0]))
-    <pad> prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them between 1999 and 2002.</s>
+    <pad> prosecutors say the marriages were part of an immigration scam. if convicted, barrientos faces two criminal
+    counts of "offering a false instrument for filing in the first degree" she has been married 10 times, nine of them
+    between 1999 and 2002.


 Translation
@@ -861,7 +869,7 @@ translation task, you may leverage the `run_translation.py
 An example of a translation dataset is the WMT English to German dataset, which has sentences in English as the input
 data and the corresponding sentences in German as the target data. If you would like to fine-tune a model on a
 translation task, various approaches are described in this :prefix_link:`document
-<examples/pytorch.translation/README.md>`.
+<examples/pytorch/translation/README.md>`.

 Here is an example of using the pipelines to do translation. It leverages a T5 model that was only pre-trained on a
 multi-task mixture dataset (including WMT), yet, yielding impressive translation results.
@@ -888,25 +896,32 @@ Here is an example of doing translation using a model and a tokenizer. The proce
 .. code-block::

    >>> ## PYTORCH CODE
-    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

-    >>> model = AutoModelWithLMHead.from_pretrained("t5-base")
+    >>> model = AutoModelForSeq2SeqLM.from_pretrained("t5-base")
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

-    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt")
-    >>> outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
-    >>> ## TENSORFLOW CODE
-    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
-
-    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base")
-    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
-
-    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf")
-    >>> outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
-
-As with the pipeline example, we get the same translation:
-
-.. code-block::
+    >>> inputs = tokenizer(
+    ...     "translate English to German: Hugging Face is a technology company based in New York and Paris",
+    ...     return_tensors="pt"
+    ... )
+    >>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)

    >>> print(tokenizer.decode(outputs[0]))
-    Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.
+    <pad> Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.</s>
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelForSeq2SeqLM, AutoTokenizer
+
+    >>> model = TFAutoModelForSeq2SeqLM.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    >>> inputs = tokenizer(
+    ...     "translate English to German: Hugging Face is a technology company based in New York and Paris",
+    ...     return_tensors="tf"
+    ... )
+    >>> outputs = model.generate(inputs["input_ids"], max_length=40, num_beams=4, early_stopping=True)
+
+    >>> print(tokenizer.decode(outputs[0]))
+    <pad> Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.
+
+We get the same translation as with the pipeline example.
--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -1080,6 +1080,8 @@ If you need to capture both streams at once, use the parent :obj:`CaptureStd` cl
        function_that_writes_to_stdout_and_stderr()
    print(cs.err, cs.out)

+Also, to aid debugging test issues, by default these context managers automatically replay the captured streams on exit
+from the context.


 Capturing logger stream
--- a/docs/source/tokenizer_summary.rst
+++ b/docs/source/tokenizer_summary.rst
@@ -182,9 +182,10 @@ base vocabulary, we obtain:

 BPE then counts the frequency of each possible symbol pair and picks the symbol pair that occurs most frequently. In
 the example above ``"h"`` followed by ``"u"`` is present `10 + 5 = 15` times (10 times in the 10 occurrences of
-``"hug"``, 5 times in the 5 occurrences of "hugs"). However, the most frequent symbol pair is ``"u"`` followed by "g",
-occurring `10 + 5 + 5 = 20` times in total. Thus, the first merge rule the tokenizer learns is to group all ``"u"``
-symbols followed by a ``"g"`` symbol together. Next, "ug" is added to the vocabulary. The set of words then becomes
+``"hug"``, 5 times in the 5 occurrences of ``"hugs"``). However, the most frequent symbol pair is ``"u"`` followed by
+``"g"``, occurring `10 + 5 + 5 = 20` times in total. Thus, the first merge rule the tokenizer learns is to group all
+``"u"`` symbols followed by a ``"g"`` symbol together. Next, ``"ug"`` is added to the vocabulary. The set of words then
+becomes

 .. code-block::

--- a/docs/source/training.rst
+++ b/docs/source/training.rst
@@ -33,7 +33,7 @@ Preparing the datasets
   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
   picture-in-picture" allowfullscreen></iframe>

-We will use the `🤗 Datasets <https:/github.com/huggingface/datasets/>`__ library to download and preprocess the IMDB
+We will use the `🤗 Datasets <https://github.com/huggingface/datasets/>`__ library to download and preprocess the IMDB
 datasets. We will go over this part pretty quickly. Since the focus of this tutorial is on training, you should refer
 to the 🤗 Datasets `documentation <https://huggingface.co/docs/datasets/>`__ or the :doc:`preprocessing` tutorial for
 more information.
@@ -240,11 +240,11 @@ Then we convert everything in big tensors and use the :obj:`tf.data.Dataset.from

 .. code-block:: python

-    train_features = {x: tf_train_dataset[x].to_tensor() for x in tokenizer.model_input_names}
+    train_features = {x: tf_train_dataset[x] for x in tokenizer.model_input_names}
    train_tf_dataset = tf.data.Dataset.from_tensor_slices((train_features, tf_train_dataset["label"]))
    train_tf_dataset = train_tf_dataset.shuffle(len(tf_train_dataset)).batch(8)

-    eval_features = {x: tf_eval_dataset[x].to_tensor() for x in tokenizer.model_input_names}
+    eval_features = {x: tf_eval_dataset[x] for x in tokenizer.model_input_names}
    eval_tf_dataset = tf.data.Dataset.from_tensor_slices((eval_features, tf_eval_dataset["label"]))
    eval_tf_dataset = eval_tf_dataset.batch(8)

@@ -281,7 +281,7 @@ Fine-tuning in native PyTorch
   frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope;
   picture-in-picture" allowfullscreen></iframe>

-You might need to restart your notebook at this stage to free some memory, or excute the following code:
+You might need to restart your notebook at this stage to free some memory, or execute the following code:

 .. code-block:: python

@@ -335,7 +335,7 @@ scheduler. The default optimizer used by the :class:`~transformers.Trainer` is :

    optimizer = AdamW(model.parameters(), lr=5e-5)

-Finally, the learning rate scheduler used by default it just a linear decay form the maximum value (5e-5 here) to 0:
+Finally, the learning rate scheduler used by default is just a linear decay from the maximum value (5e-5 here) to 0:

 .. code-block:: python

--- a/examples/README.md
+++ b/examples/README.md
@@ -42,6 +42,7 @@ To browse the examples corresponding to released versions of 🤗 Transformers,

 <details>
  <summary>Examples for older versions of 🤗 Transformers</summary>
+
  - [v4.5.1](https://github.com/huggingface/transformers/tree/v4.5.1/examples)
  - [v4.4.2](https://github.com/huggingface/transformers/tree/v4.4.2/examples)
  - [v4.3.3](https://github.com/huggingface/transformers/tree/v4.3.3/examples)
@@ -71,7 +72,7 @@ To browse the examples corresponding to released versions of 🤗 Transformers,
  - [v1.0.0](https://github.com/huggingface/transformers/tree/v1.0.0/examples)
 </details>

-Alternatively, you can find switch your cloned 🤗 Transformers to a specific version (for instance with v3.5.1) with
+Alternatively, you can switch your cloned 🤗 Transformers to a specific version (for instance with v3.5.1) with
 ```bash
 git checkout tags/v3.5.1
 ```
--- a/examples/flax/README.md
+++ b/examples/flax/README.md
@@ -46,6 +46,8 @@ module abstraction using Python dataclasses that leads to concise and explicit c
 All of our JAX/Flax models are designed to run efficiently on Google
 Cloud TPUs. Here is [a guide for running JAX on Google Cloud TPU](https://cloud.google.com/tpu/docs/jax-quickstart-tpu-vm).

+Consider applying for the [Google TPU Research Cloud project](https://sites.research.google/trc/) for free TPU compute.
+
 Each example README contains more details on the specific model and training
 procedure.

@@ -59,3 +61,14 @@ For a complete overview of models that are supported in JAX/Flax, please have a

 Over 3000 pretrained checkpoints are supported in JAX/Flax as of May 2021.
 Click [here](https://huggingface.co/models?filter=jax) to see the full list on the 🤗 hub.
+
+## Upload the trained/fine-tuned model to the Hub
+
+All the example scripts support automatic upload of your final model to the [Model Hub](https://huggingface.co/models) by adding a `--push_to_hub` argument. It will then create a repository with your username slash the name of the folder you are using as `output_dir`. For instance, `"sgugger/test-mrpc"` if your username is `sgugger` and you are working in the folder `~/tmp/test-mrpc`.
+
+To specify a given repository name, use the `--hub_model_id` argument. You will need to specify the whole repository name (including your username), for instance `--hub_model_id sgugger/finetuned-bert-mrpc`. To upload to an organization you are a member of, just use the name of that organization instead of your username: `--hub_model_id huggingface/finetuned-bert-mrpc`.
+
+A few notes on this integration:
+
+- you will need to be logged in to the Hugging Face website locally for it to work. The easiest way to achieve this is to run `huggingface-cli login` and then type your username and password when prompted. You can also pass along your authentication token with the `--hub_token` argument.
+- the `output_dir` you pick will either need to be a new folder or a local clone of the distant repository you are using.
--- a/examples/flax/language-modeling/README.md
+++ b/examples/flax/language-modeling/README.md
@@ -33,53 +33,23 @@ in Norwegian on a single TPUv3-8 pod.

 The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.

-Let's start by creating a model repository to save the trained model and logs.
-Here we call the model `"norwegian-roberta-base"`, but you can change the model name as you like.
-
-You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
-you are logged in) or via the command line:
-
-```
-huggingface-cli repo create norwegian-roberta-base
-```
-
-Next we clone the model repository to add the tokenizer and model files.
-
-```
-git clone https://huggingface.co/<your-username>/norwegian-roberta-base
-```
-
-To ensure that all tensorboard traces will be uploaded correctly, we need to 
-track them. You can run the following command inside your model repo to do so.
-
-```
-cd norwegian-roberta-base
-git lfs track "*tfevents*"
-```
-
-Great, we have set up our model repository. During training, we will automatically
-push the training logs and model weights to the repo.
-
-Next, let's add a symbolic link to the `run_mlm_flax.py`.
+To set up all relevant files for training, let's create a directory.

 ```bash
-export MODEL_DIR="./norwegian-roberta-base"
-ln -s ~/transformers/examples/flax/language-modeling/run_mlm_flax.py run_mlm_flax.py
+mkdir ./norwegian-roberta-base
 ```

 ### Train tokenizer

 In the first step, we train a tokenizer to efficiently process the text input for the model. Similar to how it is shown in [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train), we use a **`ByteLevelBPETokenizer`**.
 The tokenizer is trained on the complete Norwegian dataset of OSCAR
-and consequently saved in `${MODEL_DIR}`
+and consequently saved in the cloned model directory.
 This can take up to 10 minutes depending on your hardware ☕.

 ```python
 from datasets import load_dataset
 from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer

-model_dir = "./norwegian-roberta-base"  # ${MODEL_DIR}
-
 # load dataset
 dataset = load_dataset("oscar", "unshuffled_deduplicated_no", split="train")

@@ -100,7 +70,7 @@ tokenizer.train_from_iterator(batch_iterator(), vocab_size=50265, min_frequency=
 ])

 # Save files to disk
-tokenizer.save(f"{model_dir}/tokenizer.json")
+tokenizer.save("./norwegian-roberta-base/tokenizer.json")
 ```

 ### Create configuration
@@ -112,22 +82,23 @@ in the local model folder:
 ```python
 from transformers import RobertaConfig

-model_dir = "./norwegian-roberta-base"  # ${MODEL_DIR}
-
-config = RobertaConfig.from_pretrained("roberta-base", vocab_size=tokenizer.vocab_size)
-config.save_pretrained(model_dir)
+config = RobertaConfig.from_pretrained("roberta-base", vocab_size=50265)
+config.save_pretrained("./norwegian-roberta-base")
 ```

+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
 ### Train model

 Next we can run the example script to pretrain the model:

 ```bash
-./run_mlm_flax.py \
-    --output_dir="${MODEL_DIR}" \
+python run_mlm_flax.py \
+    --output_dir="./norwegian-roberta-base" \
    --model_type="roberta" \
-    --config_name="${MODEL_DIR}" \
-    --tokenizer_name="${MODEL_DIR}" \
+    --config_name="./norwegian-roberta-base" \
+    --tokenizer_name="./norwegian-roberta-base" \
    --dataset_name="oscar" \
    --dataset_config_name="unshuffled_deduplicated_no" \
    --max_seq_length="128" \
@@ -149,7 +120,7 @@ Next we can run the example script to pretrain the model:
 Training should converge at a loss and accuracy 
 of 1.78 and 0.64 respectively after 18 epochs on a single TPUv3-8.
 This should take less than 18 hours.
-Training statistics can be accessed on [tfhub.de](https://tensorboard.dev/experiment/GdYmdak2TWeVz0DDRYOrrg).
+Training statistics can be accessed on [tfhub.dev](https://tensorboard.dev/experiment/GdYmdak2TWeVz0DDRYOrrg).

 For a step-by-step walkthrough of how to do masked language modeling in Flax, please have a 
 look at [this](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/masked_language_modeling_flax.ipynb) google colab.
@@ -164,41 +135,46 @@ in Norwegian on a single TPUv3-8 pod.

 The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.

-Let's start by creating a model repository to save the trained model and logs.
-Here we call the model `"norwegian-gpt2"`, but you can change the model name as you like.

-You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
-you are logged in) or via the command line:
-
-```
-huggingface-cli repo create norwegian-gpt2
-```
-
-Next we clone the model repository to add the tokenizer and model files.
-
-```
-git clone https://huggingface.co/<your-username>/norwegian-gpt2
-```
-
-To ensure that all tensorboard traces will be uploaded correctly, we need to 
-track them. You can run the following command inside your model repo to do so.
-
-```
-cd norwegian-gpt2
-git lfs track "*tfevents*"
-```
-
-Great, we have set up our model repository. During training, we will automatically
-push the training logs and model weights to the repo.
-
-Next, let's add a symbolic link to the `run_clm_flax.py`.
+To set up all relevant files for training, let's create a directory.

 ```bash
-export MODEL_DIR="./norwegian-gpt2"
-ln -s ~/transformers/examples/flax/language-modeling/run_clm_flax.py run_clm_flax.py
+mkdir ./norwegian-gpt2
 ```

-Next, we'll follow the same steps as above in [Train tokenizer](#train-tokenizer) to train the tokenizer.
+### Train tokenizer
+
+In the first step, we train a tokenizer to efficiently process the text input for the model. Similar to how it is shown in [How to train a new language model from scratch using Transformers and Tokenizers](https://huggingface.co/blog/how-to-train), we use a **`ByteLevelBPETokenizer`**.
+The tokenizer is trained on the complete Norwegian dataset of OSCAR
+and consequently saved in the cloned model directory.
+This can take up to 10 minutes depending on your hardware ☕.
+
+```python
+from datasets import load_dataset
+from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer
+
+# load dataset
+dataset = load_dataset("oscar", "unshuffled_deduplicated_no", split="train")
+
+# Instantiate tokenizer
+tokenizer = ByteLevelBPETokenizer()
+
+def batch_iterator(batch_size=1000):
+    for i in range(0, len(dataset), batch_size):
+        yield dataset[i: i + batch_size]["text"]
+
+# Customized training
+tokenizer.train_from_iterator(batch_iterator(), vocab_size=50257, min_frequency=2, special_tokens=[
+    "<s>",
+    "<pad>",
+    "</s>",
+    "<unk>",
+    "<mask>",
+])
+
+# Save files to disk
+tokenizer.save("./norwegian-gpt2/tokenizer.json")
+```

 ### Create configuration

@@ -209,22 +185,23 @@ in the local model folder:
 ```python
 from transformers import GPT2Config

-model_dir = "./norwegian-gpt2"  # ${MODEL_DIR}
-
-config = GPT2Config.from_pretrained("gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0, vocab_size=tokenizer.vocab_size)
-config.save_pretrained(model_dir)
+config = GPT2Config.from_pretrained("gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0, vocab_size=50257)
+config.save_pretrained("./norwegian-gpt2")
 ```

+Great, we have set up our model repository. During training, we will now automatically
+push the training logs and model weights to the repo.
+
 ### Train model

-Next we can run the example script to pretrain the model:
+Finally, we can run the example script to pretrain the model:

 ```bash
-./run_clm_flax.py \
-    --output_dir="${MODEL_DIR}" \
+python run_clm_flax.py \
+    --output_dir="./norwegian-gpt2" \
    --model_type="gpt2" \
-    --config_name="${MODEL_DIR}" \
-    --tokenizer_name="${MODEL_DIR}" \
+    --config_name="./norwegian-gpt2" \
+    --tokenizer_name="./norwegian-gpt2" \
    --dataset_name="oscar" \
    --dataset_config_name="unshuffled_deduplicated_no" \
    --do_train --do_eval \
@@ -246,6 +223,9 @@ of 3.24 and 25.72 respectively after 20 epochs on a single TPUv3-8.
 This should take less than ~21 hours.
 Training statistics can be accessed on [tfhub.dev](https://tensorboard.dev/experiment/2zEhLwJ0Qp2FAkI3WVH9qA).

+For a step-by-step walkthrough of how to do causal language modeling in Flax, please have a 
+look at [this](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/causal_language_modeling_flax.ipynb) google colab.
+
 ## T5-like span-masked language modeling

 In the following, we demonstrate how to train a T5 model using the span-masked language model 
@@ -259,36 +239,10 @@ The example script uses the 🤗 Datasets library. You can easily customize them
 Let's start by creating a model repository to save the trained model and logs.
 Here we call the model `"norwegian-t5-base"`, but you can change the model name as you like.

-You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
-you are logged in) or via the command line:
-
-```
-huggingface-cli repo create norwegian-t5-base
-```
-
-Next we clone the model repository to add the tokenizer and model files.
-
-```
-git clone https://huggingface.co/<your-username>/norwegian-t5-base
-```
-
-To ensure that all tensorboard traces will be uploaded correctly, we need to 
-track them. You can run the following command inside your model repo to do so.
-
-```
-cd norwegian-t5-base
-git lfs track "*tfevents*"
-```
-
-Great, we have set up our model repository. During training, we will automatically
-push the training logs and model weights to the repo.
-
-Next, let's add a symbolic link to the `run_t5_mlm_flax.py` and `t5_tokenizer_model` scripts.
+To set up all relevant files for training, let's create a directory.

 ```bash
-export MODEL_DIR="./norwegian-t5-base"
-ln -s ~/transformers/examples/flax/language-modeling/run_t5_mlm_flax.py run_t5_mlm_flax.py
-ln -s ~/transformers/examples/flax/language-modeling/t5_tokenizer_model.py t5_tokenizer_model.py
+mkdir ./norwegian-t5-base
 ```

 ### Train tokenizer
@@ -299,7 +253,7 @@ a sentencepiece unigram tokenizer as shown in [t5_tokenizer_model.py](https://gi
 which is heavily inspired from [yandex-research/DeDLOC's tokenizer model](https://github.com/yandex-research/DeDLOC/blob/5c994bc64e573702a9a79add3ecd68b38f14b548/sahajbert/tokenizer/tokenizer_model.py) .

 The tokenizer is trained on the complete Norwegian dataset of OSCAR
-and consequently saved in `${MODEL_DIR}`
+and consequently saved in the cloned model directory.
 This can take up to 120 minutes depending on your hardware ☕☕☕ .

 ```python
@@ -310,7 +264,6 @@ from t5_tokenizer_model import SentencePieceUnigramTokenizer

 vocab_size = 32_000
 input_sentence_size = None
-model_dir = "./norwegian-t5-base"  # ${MODEL_DIR}

 # Initialize a dataset
 dataset = datasets.load_dataset("oscar", name="unshuffled_deduplicated_no", split="train")
@@ -335,7 +288,7 @@ tokenizer.train_from_iterator(
 )

 # Save files to disk
-tokenizer.save(f"{model_dir}/tokenizer.json")
+tokenizer.save("./norwegian-t5-base/tokenizer.json")
 ```

 ### Create configuration
@@ -347,22 +300,23 @@ in the local model folder:
 ```python
 from transformers import T5Config

-model_dir = "./norwegian-t5-base"  # ${MODEL_DIR}
-
-config = T5Config.from_pretrained("google/t5-v1_1-base", vocab_size=tokenizer.vocab_size)
-config.save_pretrained(model_dir)
+config = T5Config.from_pretrained("google/t5-v1_1-base", vocab_size=tokenizer.get_vocab_size())
+config.save_pretrained("./norwegian-t5-base")
 ```

+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
 ### Train model

 Next we can run the example script to pretrain the model:

 ```bash
-./run_t5_mlm_flax.py \
-	--output_dir="./" \
+python run_t5_mlm_flax.py \
+	--output_dir="./norwegian-t5-base" \
 	--model_type="t5" \
-	--config_name="./" \
-	--tokenizer_name="./" \
+	--config_name="./norwegian-t5-base" \
+	--tokenizer_name="./norwegian-t5-base" \
 	--dataset_name="oscar" \
 	--dataset_config_name="unshuffled_deduplicated_no" \
 	--max_seq_length="512" \
@@ -373,15 +327,15 @@ Next we can run the example script to pretrain the model:
 	--weight_decay="0.001" \
 	--warmup_steps="2000" \
 	--overwrite_output_dir \
-	--logging_steps="100" \
-	--save_steps="1000" \
-	--eval_steps="1000" \
+	--logging_steps="500" \
+	--save_steps="10000" \
+	--eval_steps="2500" \
 	--push_to_hub
 ```

 Training should converge at a loss and accuracy 
-of 2.2 and 58.0 respectively after 2 epochs on a single TPUv3-8.
-This should take around 24 hours.
+of 2.36 and 57.0 respectively after 3 epochs on a single TPUv3-8.
+This should take around 4.5 hours.
 Training statistics can be accessed directly on the 🤗 [hub](https://huggingface.co/patrickvonplaten/t5-base-norwegian/tensorboard)

 ## Runtime evaluation
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -31,6 +31,7 @@ from pathlib import Path
 from typing import Callable, Optional

 import datasets
+import numpy as np
 from datasets import Dataset, load_dataset
 from tqdm import tqdm

@@ -42,6 +43,7 @@ from flax import jax_utils, traverse_util
 from flax.jax_utils import unreplicate
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
+from huggingface_hub import Repository
 from transformers import (
    CONFIG_MAPPING,
    FLAX_MODEL_FOR_CAUSAL_LM_MAPPING,
@@ -51,7 +53,9 @@ from transformers import (
    HfArgumentParser,
    TrainingArguments,
    is_tensorboard_available,
+    set_seed,
 )
+from transformers.file_utils import get_full_repo_name
 from transformers.testing_utils import CaptureLogger


@@ -154,6 +158,9 @@ class DataTrainingArguments:
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
+    keep_linebreaks: bool = field(
+        default=True, metadata={"help": "Whether to keep line breaks when using TXT files or not."}
+    )

    def __post_init__(self):
        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
@@ -182,18 +189,16 @@ def data_loader(rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuf
    steps_per_epoch = len(dataset) // batch_size

    if shuffle:
-        batch_idx = jax.random.permutation(rng, len(dataset))
+        batch_idx = np.random.permutation(len(dataset))
    else:
-        batch_idx = jnp.arange(len(dataset))
+        batch_idx = np.arange(len(dataset))

    batch_idx = batch_idx[: steps_per_epoch * batch_size]  # Skip incomplete batch.
    batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))

    for idx in batch_idx:
        batch = dataset[idx]
-        batch = {k: jnp.array(v) for k, v in batch.items()}
-
-        batch = shard(batch)
+        batch = {k: np.array(v) for k, v in batch.items()}

        yield batch

@@ -269,6 +274,19 @@ def main():
    # Set the verbosity to info of the Transformers logger (on main process only):
    logger.info(f"Training/evaluation parameters {training_args}")

+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+
+    # Handle the repository creation
+    if training_args.push_to_hub:
+        if training_args.hub_model_id is None:
+            repo_name = get_full_repo_name(
+                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+            )
+        else:
+            repo_name = training_args.hub_model_id
+        repo = Repository(training_args.output_dir, clone_from=repo_name)
+
    #  Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
@@ -299,6 +317,7 @@ def main():
            )
    else:
        data_files = {}
+        dataset_args = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
        if data_args.validation_file is not None:
@@ -306,20 +325,23 @@ def main():
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
-        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+            dataset_args["keep_linebreaks"] = data_args.keep_linebreaks
+        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir, **dataset_args)

-        if "validation" not in datasets.keys():
-            datasets["validation"] = load_dataset(
+        if "validation" not in dataset.keys():
+            dataset["validation"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
+                **dataset_args,
            )
-            datasets["train"] = load_dataset(
+            dataset["train"] = load_dataset(
                extension,
                data_files=data_files,
                split=f"train[{data_args.validation_split_percentage}%:]",
                cache_dir=model_args.cache_dir,
+                **dataset_args,
            )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.
@@ -577,7 +599,7 @@ def main():

    train_time = 0
    train_metrics = []
-    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    epochs = tqdm(range(num_epochs), desc="Epoch ... ", position=0)
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()
@@ -591,6 +613,7 @@ def main():
        # train
        for step in tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False):
            batch = next(train_loader)
+            batch = shard(batch)
            state, train_metric = p_train_step(state, batch)
            train_metrics.append(train_metric)

@@ -617,6 +640,7 @@ def main():
                for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False):
                    # Model forward
                    batch = next(eval_loader)
+                    batch = shard(batch)
                    metrics = p_eval_step(state.params, batch)
                    eval_metrics.append(metrics)

@@ -642,12 +666,10 @@ def main():
                # save checkpoint after each epoch and push checkpoint to the hub
                if jax.process_index() == 0:
                    params = jax.device_get(unreplicate(state.params))
-                    model.save_pretrained(
-                        training_args.output_dir,
-                        params=params,
-                        push_to_hub=training_args.push_to_hub,
-                        commit_message=f"Saving weights and logs of step {cur_step}",
-                    )
+                    model.save_pretrained(training_args.output_dir, params=params)
+                    tokenizer.save_pretrained(training_args.output_dir)
+                    if training_args.push_to_hub:
+                        repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False)


 if __name__ == "__main__":
--- a/examples/flax/language-modeling/run_mlm_flax.py
+++ b/examples/flax/language-modeling/run_mlm_flax.py
@@ -41,6 +41,7 @@ import optax
 from flax import jax_utils, traverse_util
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
+from huggingface_hub import Repository
 from transformers import (
    CONFIG_MAPPING,
    FLAX_MODEL_FOR_MASKED_LM_MAPPING,
@@ -54,6 +55,7 @@ from transformers import (
    is_tensorboard_available,
    set_seed,
 )
+from transformers.file_utils import get_full_repo_name


 MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
@@ -214,7 +216,7 @@ class FlaxDataCollatorForLanguageModeling:

    def mask_tokens(
        self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
-    ) -> Tuple[jnp.ndarray, jnp.ndarray]:
+    ) -> Tuple[np.ndarray, np.ndarray]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
@@ -308,6 +310,16 @@ if __name__ == "__main__":
    # Set seed before initializing model.
    set_seed(training_args.seed)

+    # Handle the repository creation
+    if training_args.push_to_hub:
+        if training_args.hub_model_id is None:
+            repo_name = get_full_repo_name(
+                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+            )
+        else:
+            repo_name = training_args.hub_model_id
+        repo = Repository(training_args.output_dir, clone_from=repo_name)
+
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
@@ -683,9 +695,7 @@ if __name__ == "__main__":
                # save checkpoint after each epoch and push checkpoint to the hub
                if jax.process_index() == 0:
                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-                    model.save_pretrained(
-                        training_args.output_dir,
-                        params=params,
-                        push_to_hub=training_args.push_to_hub,
-                        commit_message=f"Saving weights and logs of step {cur_step}",
-                    )
+                    model.save_pretrained(training_args.output_dir, params=params)
+                    tokenizer.save_pretrained(training_args.output_dir)
+                    if training_args.push_to_hub:
+                        repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False)
--- a/examples/flax/language-modeling/run_t5_mlm_flax.py
+++ b/examples/flax/language-modeling/run_t5_mlm_flax.py
@@ -39,6 +39,7 @@ import optax
 from flax import jax_utils, traverse_util
 from flax.training import train_state
 from flax.training.common_utils import get_metrics, onehot, shard
+from huggingface_hub import Repository
 from transformers import (
    CONFIG_MAPPING,
    FLAX_MODEL_FOR_MASKED_LM_MAPPING,
@@ -52,6 +53,7 @@ from transformers import (
    is_tensorboard_available,
    set_seed,
 )
+from transformers.file_utils import get_full_repo_name
 from transformers.models.t5.modeling_flax_t5 import shift_tokens_right


@@ -353,7 +355,8 @@ class FlaxDataCollatorForT5MLM:
            np.random.shuffle(mask_indices)
            first_in_segment = np.pad(mask_indices, [[1, 0]])
            segment_id = np.cumsum(first_in_segment)
-            segment_length = np.asarray(jax.ops.segment_sum(np.ones_like(segment_id), segment_id))
+            # count length of sub segments assuming that list is sorted
+            _, segment_length = np.unique(segment_id, return_counts=True)
            return segment_length

        noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans)
@@ -437,6 +440,16 @@ if __name__ == "__main__":
    # Set seed before initializing model.
    set_seed(training_args.seed)

+    # Handle the repository creation
+    if training_args.push_to_hub:
+        if training_args.hub_model_id is None:
+            repo_name = get_full_repo_name(
+                Path(training_args.output_dir).absolute().name, token=training_args.hub_token
+            )
+        else:
+            repo_name = training_args.hub_model_id
+        repo = Repository(training_args.output_dir, clone_from=repo_name)
+
    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
    # (the dataset will be downloaded automatically from the datasets Hub).
@@ -720,7 +733,7 @@ if __name__ == "__main__":
    state = jax_utils.replicate(state)

    train_time = 0
-    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    epochs = tqdm(range(num_epochs), desc="Epoch ... ", position=0)
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()
@@ -790,9 +803,7 @@ if __name__ == "__main__":
                # save checkpoint after each epoch and push checkpoint to the hub
                if jax.process_index() == 0:
                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-                    model.save_pretrained(
-                        training_args.output_dir,
-                        params=params,
-                        push_to_hub=training_args.push_to_hub,
-                        commit_message=f"Saving weights and logs of step {cur_step}",
-                    )
+                    model.save_pretrained(training_args.output_dir, params=params)
+                    tokenizer.save_pretrained(training_args.output_dir)
+                    if training_args.push_to_hub:
+                        repo.push_to_hub(commit_message=f"Saving weights and logs of step {cur_step}", blocking=False)
--- a/examples/flax/question-answering/README.md
+++ b/examples/flax/question-answering/README.md
@@ -0,0 +1,104 @@
+<!---
+Copyright 2021 The Google Flax Team Authors and HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Question Answering examples
+
+Based on the script [`run_qa.py`](https://github.com/huggingface/transformers/blob/master/examples/flax/question-answering/run_qa.py).
+
+**Note:** This script only works with models that have a fast tokenizer (backed by the 🤗 Tokenizers library) as it
+uses special features of those tokenizers. You can check if your favorite model has a fast tokenizer in
+[this table](https://huggingface.co/transformers/index.html#supported-frameworks), if it doesn't you can still use the old version
+of the script.
+
+
+The following example fine-tunes BERT on SQuAD:
+
+
+```bash
+python run_qa.py \
+  --model_name_or_path bert-base-uncased \
+  --dataset_name squad \
+  --do_train   \
+  --do_eval   \
+  --max_seq_length 384 \
+  --doc_stride 128 \
+  --learning_rate 3e-5 \
+  --num_train_epochs 2 \
+  --per_device_train_batch_size 12 \
+  --output_dir ./bert-qa-squad \
+  --eval_steps 1000 \
+  --push_to_hub
+```
+
+Using the command above, the script will train for 2 epochs and run eval after each epoch. 
+Metrics and hyperparameters are stored in Tensorflow event files in `--output_dir`.
+You can see the results by running `tensorboard` in that directory:
+
+```bash
+$ tensorboard --logdir .
+```
+
+or directly on the hub under *Training metrics*.
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 88.62
+exact_match = 81.34
+```
+
+sample Metrics - [tfhub.dev](https://tensorboard.dev/experiment/6gU75Hx8TGCnc6tr4ZgI9Q)
+
+Here is an example training on 4 TITAN RTX GPUs and Bert Whole Word Masking uncased model to reach an F1 > 93 on SQuAD1.1:
+
+```bash
+export CUDA_VISIBLE_DEVICES=0,1,2,3
+python run_qa.py   \
+--model_name_or_path bert-large-uncased-whole-word-masking   \
+--dataset_name squad   \
+--do_train   \
+--do_eval   \
+--per_device_train_batch_size 6   \
+--learning_rate 3e-5   \
+--num_train_epochs 2   \
+--max_seq_length 384   \
+--doc_stride 128   \
+--output_dir ./wwm_uncased_finetuned_squad/ \
+--eval_steps 1000 \
+--push_to_hub
+```
+
+Training with the previously defined hyper-parameters yields the following results:
+
+```bash
+f1 = 93.31
+exact_match = 87.04
+```
+
+
+### Usage notes
+
+Note that when contexts are long they may be split into multiple training cases, not all of which may contain
+the answer span. 
+
+As-is, the example script will train on SQuAD or any other question-answering dataset formatted the same way, and can handle user
+inputs as well.
+
+### Memory usage and data loading
+
+One thing to note is that all data is loaded into memory in this script. Most question answering datasets are small
+enough that this is not an issue, but if you have a very large dataset you will need to modify the script to handle
+data streaming.
--- a/examples/flax/question-answering/requirements.txt
+++ b/examples/flax/question-answering/requirements.txt
@@ -0,0 +1,5 @@
+datasets >= 1.8.0
+jax>=0.2.17
+jaxlib>=0.1.68
+flax>=0.3.4
+optax>=0.0.8
--- a/Show More
+++ b/Show More