Patch release: v4.9.2

Tpu tie weights (#13030 )
* Fix tied weights on TPU * Manually tie weights in no trainer examples * Fix for test * One last missing * Gettning owned by my scripts * Address review comments * Fix test * Fix tests * Fix reformer tests
2021-08-09 16:01:36 +02:00 · 2021-08-09 15:53:05 +02:00 · 2021-08-09 15:52:16 +02:00 · 2021-08-09 15:52:07 +02:00 · 2021-08-09 15:51:04 +02:00 · 2021-08-09 15:50:58 +02:00
470 changed files with 32908 additions and 13119 deletions
--- a/.circleci/TROUBLESHOOT.md
+++ b/.circleci/TROUBLESHOOT.md
@@ -0,0 +1,7 @@
+# Troubleshooting
+
+This is a document explaining how to deal with various issues on Circle-CI. The entries may include actually solutions or pointers to Issues that cover those.
+
+## Circle CI
+
+* pytest worker runs out of resident RAM and gets killed by `cgroups`: https://github.com/huggingface/transformers/issues/11408
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -86,7 +86,13 @@ jobs:
                key: v0.4-{{ checksum "setup.py" }}
                paths:
                    - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf ./tests/ -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf $(cat test_list.txt) -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
+                  fi
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -116,7 +122,13 @@ jobs:
                key: v0.4-{{ checksum "setup.py" }}
                paths:
                    - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax ./tests/ -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_flax $(cat test_list.txt) -m is_pt_flax_cross_test --durations=0 | tee tests_output.txt
+                  fi
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -145,7 +157,13 @@ jobs:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 3 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 3 --dist=loadfile -s --make-reports=tests_torch $(cat test_list.txt) | tee tests_output.txt
+                  fi
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -172,7 +190,13 @@ jobs:
                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_tf ./tests/ | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_tf $(cat test_list.txt) | tee tests_output.txt
+                  fi
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -199,7 +223,13 @@ jobs:
                  key: v0.4-flax-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_flax ./tests/ | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_flax $(cat test_list.txt) | tee tests_output.txt
+                  fi
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -229,7 +259,13 @@ jobs:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test ./tests/ | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test $(cat test_list.txt) | tee tests_output.txt
+                  fi
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -257,7 +293,13 @@ jobs:
                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf ./tests/ -m is_pipeline_test | tee tests_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf $(cat test_list.txt) -m is_pipeline_test | tee tests_output.txt
+                  fi
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -283,7 +325,10 @@ jobs:
                  key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py | tee tests_output.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py | tee tests_output.txt
+                  fi
            - store_artifacts:
                  path: ~/transformers/tests_output.txt
            - store_artifacts:
@@ -311,7 +356,13 @@ jobs:
                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    TRANSFORMERS_IS_CI=1 python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/pytorch/ | tee examples_output.txt
+                  fi
            - store_artifacts:
                  path: ~/transformers/examples_output.txt
            - store_artifacts:
@@ -343,12 +394,51 @@ jobs:
                  key: v0.4-hub-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -sv ./tests/ -m is_staging_test
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -sv $(cat test_list.txt) -m is_staging_test
+                  fi

+    run_tests_onnxruntime:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+            TRANSFORMERS_IS_CI: yes
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[torch,testing,sentencepiece,onnxruntime]
+            - save_cache:
+                  key: v0.4-onnx-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: python utils/tests_fetcher.py | tee test_preparation.txt
+            - store_artifacts:
+                  path: ~/transformers/test_preparation.txt
+            - run: |
+                  if [ -f test_list.txt ]; then
+                    python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch $(cat test_list.txt) -k onnx | tee tests_output.txt
+                  fi
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
    build_doc:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.6
+        resource_class: large
        steps:
            - checkout
            - restore_cache:
@@ -370,6 +460,7 @@ jobs:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.6
+        resource_class: large
        steps:
            - add_ssh_keys:
                fingerprints:
@@ -392,7 +483,7 @@ jobs:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.6
-        resource_class: medium
+        resource_class: large
        environment:
            TRANSFORMERS_IS_CI: yes
        parallelism: 1
@@ -403,7 +494,7 @@ jobs:
                      - v0.4-code_quality-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install isort
+            - run: pip install isort GitPython
            - run: pip install .[all,quality]
            - save_cache:
                  key: v0.4-code_quality-{{ checksum "setup.py" }}
@@ -419,6 +510,8 @@ jobs:
            - run: python utils/check_dummies.py
            - run: python utils/check_repo.py
            - run: python utils/check_inits.py
+            - run: make deps_table_check_updated
+            - run: python utils/tests_fetcher.py --sanity_check

    check_repository_consistency:
        working_directory: ~/transformers
@@ -482,6 +575,7 @@ workflows:
            - run_tests_flax
            - run_tests_pipelines_torch
            - run_tests_pipelines_tf
+            - run_tests_onnxruntime
            - run_tests_hub
            - build_doc
            - deploy_doc: *workflow_filters
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -64,4 +64,8 @@ deploy_doc "6bc89ed" v4.4.2
 deploy_doc "4906a29" v4.5.0
 deploy_doc "4bae96e" v4.5.1
 deploy_doc "25dee4a" v4.6.0
-deploy_doc "7a6c9fa"  # v4.7.0 Latest stable release
+deploy_doc "7a6c9fa" v4.7.0
+deploy_doc "9252a51" v4.8.0
+deploy_doc "1366172" v4.8.1
+deploy_doc "96d1cfb" v4.8.2
+deploy_doc "72aee83" # v4.9.0 Latest stable release
--- a/.github/conda/meta.yaml
+++ b/.github/conda/meta.yaml
@@ -26,7 +26,7 @@ requirements:
    - regex !=2019.12.17
    - protobuf
    - tokenizers >=0.10.1,<0.11.0
-    - pyyaml
+    - pyyaml >=5.1
  run:
    - python
    - numpy >=1.17
@@ -41,7 +41,7 @@ requirements:
    - regex !=2019.12.17
    - protobuf
    - tokenizers >=0.10.1,<0.11.0
-    - pyyaml
+    - pyyaml >=5.1

 test:
  imports:
--- a/.github/workflows/TROUBLESHOOT.md
+++ b/.github/workflows/TROUBLESHOOT.md
@@ -0,0 +1,9 @@
+# Troubleshooting
+
+This is a document explaining how to deal with various issues on github-actions self-hosted CI. The entries may include actually solutions or pointers to Issues that cover those.
+
+## GitHub Actions (self-hosted CI)
+
+* Deepspeed
+
+  - if jit build hangs, clear out `rm -rf ~/.cache/torch_extensions/` reference: https://github.com/huggingface/transformers/pull/12723
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -18,6 +18,7 @@ env:
  TRANSFORMERS_IS_CI: yes
  OMP_NUM_THREADS: 8
  MKL_NUM_THREADS: 8
+  PYTEST_TIMEOUT: 60

 jobs:
  run_tests_torch_gpu:
@@ -48,7 +49,7 @@ jobs:

      - name: Run all non-slow tests on GPU
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_gpu tests
+          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -61,47 +62,47 @@ jobs:
          name: run_all_tests_torch_gpu_test_reports
          path: reports

-  run_tests_tf_gpu:
-    runs-on: [self-hosted, docker-gpu, single-gpu]
-    timeout-minutes: 120
-    container:
-      image: tensorflow/tensorflow:2.4.1-gpu
-      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece]
-
-      - name: Are GPUs recognized by our DL frameworks
-        run: |
-          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
-          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-
-      - name: Run all non-slow tests on GPU
-        env:
-          TF_NUM_INTRAOP_THREADS: 8
-          TF_NUM_INTEROP_THREADS: 1
-        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_tf_gpu_failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: run_all_tests_tf_gpu_test_reports
-          path: reports
+#  run_tests_tf_gpu:
+#    runs-on: [self-hosted, docker-gpu, single-gpu]
+#    timeout-minutes: 120
+#    container:
+#      image: tensorflow/tensorflow:2.4.1-gpu
+#      options: --gpus 0 --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#    steps:
+#      - name: Launcher docker
+#        uses: actions/checkout@v2
+#
+#      - name: NVIDIA-SMI
+#        run: |
+#          nvidia-smi
+#
+#      - name: Install dependencies
+#        run: |
+#          pip install --upgrade pip
+#          pip install .[sklearn,testing,onnxruntime,sentencepiece]
+#
+#      - name: Are GPUs recognized by our DL frameworks
+#        run: |
+#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+#
+#      - name: Run all non-slow tests on GPU
+#        env:
+#          TF_NUM_INTRAOP_THREADS: 8
+#          TF_NUM_INTEROP_THREADS: 1
+#        run: |
+#          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
+#
+#      - name: Failure short reports
+#        if: ${{ always() }}
+#        run: cat reports/tests_tf_gpu_failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: run_all_tests_tf_gpu_test_reports
+#          path: reports


  run_tests_torch_multi_gpu:
@@ -134,7 +135,7 @@ jobs:
        env:
          MKL_SERVICE_FORCE_INTEL: 1
        run: |
-          python -m pytest -n 2 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
+          python -m pytest -n 2 --dist=loadfile -v --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -147,47 +148,47 @@ jobs:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports

-  run_tests_tf_multi_gpu:
-    runs-on: [self-hosted, docker-gpu, multi-gpu]
-    timeout-minutes: 120
-    container:
-      image: tensorflow/tensorflow:2.4.1-gpu
-      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
-    steps:
-      - name: Launcher docker
-        uses: actions/checkout@v2
-
-      - name: NVIDIA-SMI
-        run: |
-          nvidia-smi
-
-      - name: Install dependencies
-        run: |
-          pip install --upgrade pip
-          pip install .[sklearn,testing,onnxruntime,sentencepiece]
-
-      - name: Are GPUs recognized by our DL frameworks
-        run: |
-          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
-          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
-
-      - name: Run all non-slow tests on GPU
-        env:
-          TF_NUM_INTRAOP_THREADS: 8
-          TF_NUM_INTEROP_THREADS: 1
-        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
-
-      - name: Failure short reports
-        if: ${{ always() }}
-        run: cat reports/tests_tf_multi_gpu_failures_short.txt
-
-      - name: Test suite reports artifacts
-        if: ${{ always() }}
-        uses: actions/upload-artifact@v2
-        with:
-          name: run_all_tests_tf_multi_gpu_test_reports
-          path: reports
+#  run_tests_tf_multi_gpu:
+#    runs-on: [self-hosted, docker-gpu, multi-gpu]
+#    timeout-minutes: 120
+#    container:
+#      image: tensorflow/tensorflow:2.4.1-gpu
+#      options: --gpus all --shm-size "16gb" --ipc host -v /mnt/cache/.cache/huggingface:/mnt/cache/
+#    steps:
+#      - name: Launcher docker
+#        uses: actions/checkout@v2
+#
+#      - name: NVIDIA-SMI
+#        run: |
+#          nvidia-smi
+#
+#      - name: Install dependencies
+#        run: |
+#          pip install --upgrade pip
+#          pip install .[sklearn,testing,onnxruntime,sentencepiece]
+#
+#      - name: Are GPUs recognized by our DL frameworks
+#        run: |
+#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+#          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+#
+#      - name: Run all non-slow tests on GPU
+#        env:
+#          TF_NUM_INTRAOP_THREADS: 8
+#          TF_NUM_INTEROP_THREADS: 1
+#        run: |
+#          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
+#
+#      - name: Failure short reports
+#        if: ${{ always() }}
+#        run: cat reports/tests_tf_multi_gpu_failures_short.txt
+#
+#      - name: Test suite reports artifacts
+#        if: ${{ always() }}
+#        uses: actions/upload-artifact@v2
+#        with:
+#          name: run_all_tests_tf_multi_gpu_test_reports
+#          path: reports

  run_tests_torch_cuda_extensions_gpu:
    runs-on: [self-hosted, docker-gpu, single-gpu]
@@ -217,7 +218,7 @@ jobs:

      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+          python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ always() }}
@@ -258,7 +259,7 @@ jobs:

      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+          python -m pytest -n 1 --dist=loadfile -v --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ always() }}
@@ -278,9 +279,9 @@ jobs:
    if: always()
    needs: [
        run_tests_torch_gpu,
-        run_tests_tf_gpu,
+#        run_tests_tf_gpu,
        run_tests_torch_multi_gpu,
-        run_tests_tf_multi_gpu,
+#        run_tests_tf_multi_gpu,
        run_tests_torch_cuda_extensions_gpu,
        run_tests_torch_cuda_extensions_multi_gpu
    ]
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -14,6 +14,7 @@ env:
  RUN_SLOW: yes
  OMP_NUM_THREADS: 16
  MKL_NUM_THREADS: 16
+  PYTEST_TIMEOUT: 600

 jobs:
  run_all_tests_torch_gpu:
@@ -44,7 +45,7 @@ jobs:

      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -60,7 +61,7 @@ jobs:
          TRANSFORMERS_IS_CI: yes
        run: |
          pip install -r examples/pytorch/_tests_requirements.txt
-          python -m pytest -n 1 --dist=loadfile --make-reports=examples_torch_gpu examples
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=examples_torch_gpu examples

      - name: Failure short reports
        if: ${{ always() }}
@@ -71,7 +72,7 @@ jobs:
        env:
          RUN_PIPELINE_TESTS: yes
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -112,7 +113,7 @@ jobs:
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -125,7 +126,7 @@ jobs:
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -168,7 +169,7 @@ jobs:
        env:
          MKL_SERVICE_FORCE_INTEL: 1
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_multi_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -179,7 +180,7 @@ jobs:
        env:
          RUN_PIPELINE_TESTS: yes
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -220,7 +221,7 @@ jobs:
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_tf_multi_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_tf_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -233,7 +234,7 @@ jobs:
          TF_NUM_INTEROP_THREADS: 1
          TF_NUM_INTRAOP_THREADS: 16
        run: |
-          python -m pytest -n 1 --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
+          python -m pytest -n 1 -v --dist=loadfile -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
@@ -274,7 +275,7 @@ jobs:

      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ always() }}
@@ -315,7 +316,7 @@ jobs:

      - name: Run all tests on GPU
        run: |
-          python -m pytest -n 1 --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended
+          python -m pytest -n 1 -v --dist=loadfile --make-reports=tests_torch_cuda_extensions_multi_gpu tests/deepspeed tests/extended

      - name: Failure short reports
        if: ${{ always() }}
--- a/7
+++ b/7
@@ -21,6 +21,12 @@ modified_only_fixup:
 deps_table_update:
 	@python setup.py deps_table_update

+deps_table_check_updated:
+	@md5sum src/transformers/dependency_versions_table.py > md5sum.saved
+	@python setup.py deps_table_update
+	@md5sum -c --quiet md5sum.saved || (printf "\nError: the version dependency table is outdated.\nPlease run 'make fixup' or 'make style' and commit the changes.\n\n" && exit 1)
+	@rm md5sum.saved
+
 # autogenerating code

 autogenerate_code: deps_table_update
@@ -34,6 +40,7 @@ extra_quality_checks:
 	python utils/check_dummies.py
 	python utils/check_repo.py
 	python utils/check_inits.py
+	python utils/tests_fetcher.py --sanity_check

 # this target runs checks on all files
 quality:
--- a/README.md
+++ b/README.md
@@ -38,6 +38,14 @@ limitations under the License.
    <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
 </p>

+<h4 align="center">
+    <p>
+        <b>English</b> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hans.md">简体中文</a> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hant.md">繁體中文</a>
+    <p>
+</h4>
+
 <h3 align="center">
    <p>State-of-the-art Natural Language Processing for Jax, PyTorch and TensorFlow</p>
 </h3>
@@ -212,7 +220,8 @@ Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://h
 1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
 1. **[ByT5](https://huggingface.co/transformers/model_doc/byt5.html)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
 1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-1. **[CLIP](https://huggingface.co/transformers/model_doc/clip.html)** from (OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
+1. **[CANINE](https://huggingface.co/transformers/model_doc/canine.html)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
+1. **[CLIP](https://huggingface.co/transformers/model_doc/clip.html)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
 1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
 1. **[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
 1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
--- a/README_zh-hans.md
+++ b/README_zh-hans.md
@@ -0,0 +1,334 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!---
+A useful guide for English-Chinese translation of Hugging Face documentation
+- Add space around English words and numbers when they appear between Chinese characters. E.g., 共 100 多种语言; 使用 transformers 库。
+- Use square quotes, e.g.,「引用」
+
+Dictionary
+
+Hugging Face: 抱抱脸
+token: 词符（并用括号标注原英文）
+tokenize: 词符化（并用括号标注原英文）
+tokenizer: 词符化器（并用括号标注原英文）
+transformer: transformer（不翻译）
+pipeline: 流水线
+API: API (不翻译）
+inference: 推理
+Trainer: 训练器。当作为类名出现时不翻译。
+pretrained/pretrain: 预训练
+finetune: 微调
+community: 社区
+example: 当特指仓库中 example 目录时翻译为「用例」
+Python data structures (e.g., list, set, dict): 翻译为列表，集合，词典，并用括号标注原英文
+NLP/Natural Language Processing: 以 NLP 出现时不翻译，以 Natural Language Processing 出现时翻译为自然语言处理
+checkpoint: 检查点
+-->
+
+<p align="center">
+    <br>
+    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
+    <br>
+<p>
+<p align="center">
+    <a href="https://circleci.com/gh/huggingface/transformers">
+        <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
+        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
+    </a>
+    <a href="https://huggingface.co/transformers/index.html">
+        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/transformers/index.html.svg?down_color=red&down_message=offline&up_message=online">
+    </a>
+    <a href="https://github.com/huggingface/transformers/releases">
+        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/master/CODE_OF_CONDUCT.md">
+        <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
+    </a>
+    <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
+</p>
+
+<h4 align="center">
+    <p>
+        <a href="https://github.com/huggingface/transformers/">English</a> |
+        <b>简体中文</b> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hant.md">繁體中文</a>
+    <p>
+</h4>
+
+<h3 align="center">
+    <p>为 Jax、PyTorch 和 TensorFlow 打造的先进的自然语言处理</p>
+</h3>
+
+<h3 align="center">
+    <a href="https://hf.co/course"><img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/course_banner.png"></a>
+</h3>
+
+🤗 Transformers 提供了数以千计的预训练模型，支持 100 多种语言的文本分类、信息抽取、问答、摘要、翻译、文本生成。它的宗旨让最先进的 NLP 技术人人易用。
+
+🤗 Transformers 提供了便于快速下载和使用的API，让你可以把预训练模型用在给定文本、在你的数据集上微调然后通过 [model hub](https://huggingface.co/models) 与社区共享。同时，每个定义的 Python 模块均完全独立，方便修改和快速研究实验。
+
+🤗 Transformers 支持三个最热门的深度学习库： [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/) — 并与之无缝整合。你可以直接使用一个框架训练你的模型然后用另一个加载和推理。
+
+## 在线演示
+
+你可以直接在模型页面上测试大多数 [model hub](https://huggingface.co/models) 上的模型。 我们也提供了 [私有模型托管、模型版本管理以及推理API](https://huggingface.co/pricing)。
+
+这里是一些例子：
+- [用 BERT 做掩码填词](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [用 Electra 做命名实体识别](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [用 GPT-2 做文本生成](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
+- [用 RoBERTa 做自然语言推理](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [用 BART 做文本摘要](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [用 DistilBERT 做问答](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [用 T5 做翻译](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+
+**[Write With Transformer](https://transformer.huggingface.co)**，由抱抱脸团队打造，是一个文本生成的官方 demo。
+
+## 如果你在寻找由抱抱脸团队提供的定制化支持服务
+
+<a target="_blank" href="https://huggingface.co/support">
+    <img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
+</a><br>
+
+## 快速上手
+
+我们为快速使用模型提供了 `pipeline` （流水线）API。流水线聚合了预训练模型和对应的文本预处理。下面是一个快速使用流水线去判断正负面情绪的例子：
+
+```python
+>>> from transformers import pipeline
+
+# 使用情绪分析流水线
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+第二行代码下载并缓存了流水线使用的预训练模型，而第三行代码则在给定的文本上进行了评估。这里的答案“正面” (positive) 具有 99 的置信度。
+
+许多的 NLP 任务都有开箱即用的预训练流水线。比如说，我们可以轻松的从给定文本中抽取问题答案：
+
+``` python
+>>> from transformers import pipeline
+
+# 使用问答流水线
+>>> question_answerer = pipeline('question-answering')
+>>> question_answerer({
+...     'question': 'What is the name of the repository ?',
+...     'context': 'Pipeline has been included in the huggingface/transformers repository'
+... })
+{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}
+
+```
+
+除了给出答案，预训练模型还给出了对应的置信度分数、答案在词符化 (tokenized) 后的文本中开始和结束的位置。你可以从[这个教程](https://huggingface.co/transformers/task_summary.html)了解更多流水线API支持的任务。
+
+要在你的任务上下载和使用任意预训练模型也很简单，只需三行代码。这里是 PyTorch 版的示例：
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = AutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+这里是等效的 TensorFlow 代码：
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+词符化器 (tokenizer) 为所有的预训练模型提供了预处理，并可以直接对单个字符串进行调用（比如上面的例子）或对列表 (list) 调用。它会输出一个你可以在下游代码里使用或直接通过 `**` 解包表达式传给模型的词典 (dict)。
+
+模型本身是一个常规的 [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) 或 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)（取决于你的后端），可以常规方式使用。 [这个教程](https://huggingface.co/transformers/training.html)解释了如何将这样的模型整合到经典的 PyTorch 或 TensorFlow 训练循环中，或是如何使用我们的 `Trainer` 训练器）API 来在一个新的数据集上快速微调。
+
+## 为什么要用 transformers？
+
+1. 便于使用的先进模型：
+    - NLU 和 NLG 上表现优越
+    - 对教学和实践友好且低门槛
+    - 高级抽象，只需了解三个类
+    - 对所有模型统一的API
+
+1. 更低计算开销，更少的碳排放：
+    - 研究人员可以分享亿训练的模型而非次次从头开始训练
+    - 工程师可以减少计算用时和生产环境开销
+    - 数十种模型架构、两千多个预训练模型、100多种语言支持
+
+1. 对于模型生命周期的每一个部分都面面俱到：
+    - 训练先进的模型，只需 3 行代码
+    - 模型在不同深度学习框架间任意转移，随你心意
+    - 为训练、评估和生产选择最适合的框架，衔接无缝
+
+1. 为你的需求轻松定制专属模型和用例：
+    - 我们为每种模型架构提供了多个用例来复现原论文结果
+    - 模型内部结构保持透明一致
+    - 模型文件可单独使用，方便魔改和快速实验
+
+## 什么情况下我不该用 transformers？
+
+- 本库并不是模块化的神经网络工具箱。模型文件中的代码特意呈若璞玉，未经额外抽象封装，以便研究人员快速迭代魔改而不致溺于抽象和文件跳转之中。
+- `Trainer` API 并非兼容任何模型，只为本库之模型优化。若是在寻找适用于通用机器学习的训练循环实现，请另觅他库。
+- 尽管我们已尽力而为，[examples 目录](https://github.com/huggingface/transformers/tree/master/examples)中的脚本也仅为用例而已。对于你的特定问题，它们并不一定开箱即用，可能需要改几行代码以适之。
+
+## 安装
+
+### 使用 pip
+
+这个仓库已在 Python 3.6+、Flax 0.3.2+、PyTorch 1.3.1+ 和 TensorFlow 2.3+ 下经过测试。
+
+你可以在[虚拟环境](https://docs.python.org/3/library/venv.html)中安装 🤗 Transformers。如果你还不熟悉 Python 的虚拟环境，请阅此[用户说明](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。
+
+首先，用你打算使用的版本的 Python 创建一个虚拟环境并激活。
+
+然后，你需要安装 Flax、PyTorch 或 TensorFlow 其中之一。关于在你使用的平台上安装这些框架，请参阅 [TensorFlow 安装页](https://www.tensorflow.org/install/), [PyTorch 安装页](https://pytorch.org/get-started/locally/#start-locally) 或 [Flax 安装页](https://github.com/google/flax#quick-install)。
+
+当这些后端之一安装成功后， 🤗 Transformers 可依此安装：
+
+```bash
+pip install transformers
+```
+
+如果你想要试试用例或者想在正式发布前使用最新的开发中代码，你得[从源代码安装](https://huggingface.co/transformers/installation.html#installing-from-source)。
+
+### 使用 conda
+
+自 Transformers 4.0.0 版始，我们有了一个 conda 频道： `huggingface`。
+
+🤗 Transformers 可以通过 conda 依此安装：
+
+```shell script
+conda install -c huggingface transformers
+```
+
+要通过 conda 安装 Flax、PyTorch 或 TensorFlow 其中之一，请参阅它们各自安装页的说明。
+
+## 模型架构
+
+**🤗 Transformers 支持的[所有的模型检查点](https://huggingface.co/models)** 由[用户](https://huggingface.co/users)和[组织](https://huggingface.co/organizations)上传，均与 huggingface.co [model hub](https://huggingface.co) 无缝整合。
+
+目前的检查点数量： ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
+🤗 Transformers 目前支持如下的架构（模型概述请阅[这里](https://huggingface.co/transformers/model_summary.html)）：
+
+1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (来自 Google Research and the Toyota Technological Institute at Chicago) 伴随论文 [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), 由 Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut 发布。
+1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (来自 Facebook) 伴随论文 [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) 由 Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer 发布。
+1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (来自 École polytechnique) 伴随论文 [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) 由 Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis 发布。
+1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (来自 Google) 伴随论文 [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) 由 Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova 发布。
+1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (来自 Google) 伴随论文 [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) 由 Sascha Rothe, Shashi Narayan, Aliaksei Severyn 发布。
+1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
+1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (来自 Google Research) 伴随论文 [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) 由 Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed 发布。
+1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
+1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (来自 Facebook) 伴随论文 [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) 由 Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston 发布。
+1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (来自 Alexa) 伴随论文 [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) 由 Adrian de Wynter and Daniel J. Perry 发布。
+1. **[ByT5](https://huggingface.co/transformers/model_doc/byt5.html)** (来自 Google Research) 伴随论文 [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) 由 Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel 发布。
+1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (来自 Inria/Facebook/Sorbonne) 伴随论文 [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) 由 Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot 发布。
+1. **[CANINE](https://huggingface.co/transformers/model_doc/canine.html)** (来自 Google Research) 伴随论文 [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) 由 Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting 发布。
+1. **[CLIP](https://huggingface.co/transformers/model_doc/clip.html)** (来自 OpenAI) 伴随论文 [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) 由 Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever 发布。
+1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (来自 YituTech) 伴随论文 [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) 由 Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan 发布。
+1. **[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (来自 Tsinghua University) 伴随论文 [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) 由 Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun 发布。
+1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (来自 Salesforce) 伴随论文 [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) 由 Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher 发布。
+1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。
+1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (来自 Microsoft) 伴随论文 [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) 由 Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen 发布。
+1. **[DeiT](https://huggingface.co/transformers/model_doc/deit.html)** (来自 Facebook) 伴随论文 [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) 由 Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou 发布。
+1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (来自 Facebook) 伴随论文 [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) 由 Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko 发布。
+1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (来自 Microsoft Research) 伴随论文 [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) 由 Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan 发布。
+1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (来自 HuggingFace), 伴随论文 [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) 由 Victor Sanh, Lysandre Debut and Thomas Wolf 发布。 同样的方法也应用于压缩 GPT-2 到 [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa 到 [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT 到 [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) 和德语版 DistilBERT。
+1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (来自 Facebook) 伴随论文 [Dense Passage Retrieval
+for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) 由 Vladimir Karpukhin, Barlas Oğuz, Sewon
+Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih 发布。
+1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (来自 Google Research/Stanford University) 伴随论文 [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) 由 Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning 发布。
+1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (来自 CNRS) 伴随论文 [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) 由 Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab 发布。
+1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (来自 CMU/Google Brain) 伴随论文 [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) 由 Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le 发布。
+1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (来自 OpenAI) 伴随论文 [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) 由 Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever 发布。
+1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (来自 OpenAI) 伴随论文 [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) 由 Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever** 发布。
+1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (来自 EleutherAI) 随仓库 [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) 发布。作者为 Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy 发布。
+1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (来自 Facebook) 伴随论文 [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) 由 Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed 发布。
+1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (来自 Berkeley) 伴随论文 [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) 由 Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer 发布。
+1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (来自 Microsoft Research Asia) 伴随论文 [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) 由 Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou 发布。
+1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
+1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (来自 AllenAI) 伴随论文 [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) 由 Iz Beltagy, Matthew E. Peters, Arman Cohan 发布。
+1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (来自 Studio Ousia) 伴随论文 [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) 由 Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto 发布。
+1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (来自 UNC Chapel Hill) 伴随论文 [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) 由 Hao Tan and Mohit Bansal 发布。
+1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (来自 Facebook) 伴随论文 [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) 由 Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin 发布。
+1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** 用 [OPUS](http://opus.nlpl.eu/) 数据训练的机器翻译模型由 Jörg Tiedemann 发布。[Marian Framework](https://marian-nmt.github.io/) 由微软翻译团队开发。
+1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (来自 Facebook) 伴随论文 [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) 由 Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer 发布。
+1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (来自 Facebook) 伴随论文 [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) 由 Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan 发布。
+1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
+1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (来自 NVIDIA) 伴随论文 [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) 由 Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro 发布。
+1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (来自 Microsoft Research) 伴随论文 [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) 由 Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu 发布。
+1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (来自 Google AI) 伴随论文 [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) 由 Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel 发布。
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (来自 Google) 伴随论文 [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> 由 Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu 发布。
+1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
+1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (来自 Google Research) 伴随论文 [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) 由 Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya 发布。
+1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (来自 Facebook), 伴随论文 [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) 由 Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov 发布。
+1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (来自 ZhuiyiTechnology), 伴随论文 [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) 由 Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu 发布。
+1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (来自 Facebook), 伴随论文 [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) 由 Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino 发布。
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** 伴随论文 [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) 由 Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer 发布。
+1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (来自 Google AI) 伴随论文 [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) 由 Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu 发布。
+1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (来自 Google AI) 伴随论文 [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) 由 Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos 发布。
+1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (来自 Google/CMU) 伴随论文 [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) 由 Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov 发布。
+1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (来自 Google AI) 伴随论文 [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) 由 Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby 发布。
+1. **[VisualBERT](https://huggingface.co/transformers/model_doc/visual_bert.html)** (来自 UCLA NLP) 伴随论文 [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) 由 Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang 发布。
+1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (来自 Facebook AI) 伴随论文 [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) 由 Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli 发布。
+1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (来自 Facebook) 伴随论文 [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) 由 Guillaume Lample and Alexis Conneau 发布。
+1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (来自 Microsoft Research) 伴随论文 [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) 由 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou 发布。
+1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (来自 Facebook AI), 伴随论文 [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) 由 Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov 发布。
+1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (来自 Google/CMU) 伴随论文 [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) 由 Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le 发布。
+1. **[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (来自 Facebook AI) 伴随论文 [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) 由 Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli 发布。
+1. 想要贡献新的模型？我们这里有一份**详细指引和模板**来引导你添加新的模型。你可以在 [`templates`](./templates) 目录中找到他们。记得查看 [贡献指南](./CONTRIBUTING.md) 并在开始写 PR 前联系维护人员或开一个新的 issue 来获得反馈。
+
+要检查某个模型是否已有 Flax、PyTorch 或 TensorFlow 的实现，或其是否在 🤗 Tokenizers 库中有对应词符化器（tokenizer），敬请参阅[此表](https://huggingface.co/transformers/index.html#supported-frameworks)。
+
+这些实现均已于多个数据集测试（请参看用例脚本）并应于原版实现表现相当。你可以在用例文档的[此节](https://huggingface.co/transformers/examples.html)中了解表现的细节。
+
+
+## 了解更多
+
+| 章节 | 描述 |
+|-|-|
+| [文档](https://huggingface.co/transformers/) | 完整的 API 文档和教程 |
+| [任务总结](https://huggingface.co/transformers/task_summary.html) | 🤗 Transformers 支持的任务 |
+| [预处理教程](https://huggingface.co/transformers/preprocessing.html) | 使用 `Tokenizer` 来为模型准备数据 |
+| [训练和微调](https://huggingface.co/transformers/training.html) | 在 PyTorch/TensorFlow 的训练循环或 `Trainer` API 中使用 🤗 Transformers 提供的模型 |
+| [快速上手：微调和用例脚本](https://github.com/huggingface/transformers/tree/master/examples) | 为各种任务提供的用例脚本 |
+| [模型分享和上传](https://huggingface.co/transformers/model_sharing.html) | 和社区上传和分享你微调的模型 |
+| [迁移](https://huggingface.co/transformers/migration.html) | 从 `pytorch-transformers` 或 `pytorch-pretrained-bert` 迁移到 🤗 Transformers |
+
+## 引用
+
+我们已将此库的[论文](https://www.aclweb.org/anthology/2020.emnlp-demos.6/)正式发表，如果你使用了 🤗 Transformers 库，请引用:
+```bibtex
+@inproceedings{wolf-etal-2020-transformers,
+    title = "Transformers: State-of-the-Art Natural Language Processing",
+    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = oct,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+    pages = "38--45"
+}
+```
--- a/README_zh-hant.md
+++ b/README_zh-hant.md
@@ -0,0 +1,346 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+<!---
+A useful guide for English-Traditional Chinese translation of Hugging Face documentation
+- Add space around English words and numbers when they appear between Chinese characters. E.g., 共 100 多種語言; 使用 transformers 函式庫。
+- Use square quotes, e.g.,「引用」
+- Some of terms in the file can be found at National Academy for Educational Research (https://terms.naer.edu.tw/), an official website providing bilingual translations between English and Traditional Chinese.
+
+Dictionary
+
+API: API (不翻譯）
+add: 加入
+checkpoint: 檢查點
+code: 程式碼
+community: 社群
+confidence: 信賴度
+dataset: 資料集
+documentation: 文件
+example: 基本翻譯為「範例」，或依語意翻為「例子」
+finetune: 微調
+Hugging Face: Hugging Face（不翻譯）
+implementation: 實作
+inference: 推論
+library: 函式庫
+module: 模組
+NLP/Natural Language Processing: 以 NLP 出現時不翻譯，以 Natural Language Processing 出現時翻譯為自然語言處理
+online demos: 線上Demo
+pipeline: pipeline（不翻譯）
+pretrained/pretrain: 預訓練
+Python data structures (e.g., list, set, dict): 翻譯為串列，集合，字典，並用括號標註原英文
+repository: repository（不翻譯）
+summary: 概覽
+token-: token-（不翻譯）
+Trainer: Trainer（不翻譯）
+transformer: transformer（不翻譯）
+tutorial: 教學
+user: 使用者
+-->
+
+<p align="center">
+    <br>
+    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
+    <br>
+<p>
+<p align="center">
+    <a href="https://circleci.com/gh/huggingface/transformers">
+        <img alt="Build" src="https://img.shields.io/circleci/build/github/huggingface/transformers/master">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/master/LICENSE">
+        <img alt="GitHub" src="https://img.shields.io/github/license/huggingface/transformers.svg?color=blue">
+    </a>
+    <a href="https://huggingface.co/transformers/index.html">
+        <img alt="Documentation" src="https://img.shields.io/website/http/huggingface.co/transformers/index.html.svg?down_color=red&down_message=offline&up_message=online">
+    </a>
+    <a href="https://github.com/huggingface/transformers/releases">
+        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
+    </a>
+    <a href="https://github.com/huggingface/transformers/blob/master/CODE_OF_CONDUCT.md">
+        <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
+    </a>
+    <a href="https://zenodo.org/badge/latestdoi/155220641"><img src="https://zenodo.org/badge/155220641.svg" alt="DOI"></a>
+</p>
+
+<h4 align="center">
+    <p>
+        <a href="https://github.com/huggingface/transformers/">English</a> |
+        <a href="https://github.com/huggingface/transformers/blob/master/README_zh-hans.md">简体中文</a> |
+        <b>繁體中文</b>
+    <p>
+</h4>
+
+<h3 align="center">
+    <p>為 Jax、PyTorch 以及 TensorFlow 打造的先進自然語言處理函式庫</p>
+</h3>
+
+<h3 align="center">
+    <a href="https://hf.co/course"><img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/course_banner.png"></a>
+</h3>
+
+🤗 Transformers 提供了數以千計的預訓練模型，支援 100 多種語言的文本分類、資訊擷取、問答、摘要、翻譯、文本生成。它的宗旨是讓最先進的 NLP 技術人人易用。
+
+🤗 Transformers 提供了便於快速下載和使用的API，讓你可以將預訓練模型用在給定文本、在你的資料集上微調然後經由 [model hub](https://huggingface.co/models) 與社群共享。同時，每個定義的 Python 模組架構均完全獨立，方便修改和快速研究實驗。
+
+🤗 Transformers 支援三個最熱門的深度學習函式庫： [Jax](https://jax.readthedocs.io/en/latest/), [PyTorch](https://pytorch.org/) 以及 [TensorFlow](https://www.tensorflow.org/) — 並與之完美整合。你可以直接使用其中一個框架訓練你的模型，然後用另一個載入和推論。
+
+## 線上Demo
+
+你可以直接在 [model hub](https://huggingface.co/models) 上測試大多數的模型。我們也提供了 [私有模型託管、模型版本管理以及推論API](https://huggingface.co/pricing)。
+
+這裡是一些範例：
+- [用 BERT 做遮蓋填詞](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [用 Electra 做專有名詞辨識](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [用 GPT-2 做文本生成](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
+- [用 RoBERTa 做自然語言推論](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [用 BART 做文本摘要](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [用 DistilBERT 做問答](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [用 T5 做翻譯](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)
+
+**[Write With Transformer](https://transformer.huggingface.co)**，由 Hugging Face 團隊所打造，是一個文本生成的官方 demo。
+
+## 如果你在尋找由 Hugging Face 團隊所提供的客製化支援服務
+
+<a target="_blank" href="https://huggingface.co/support">
+    <img alt="HuggingFace Expert Acceleration Program" src="https://huggingface.co/front/thumbnails/support.png" style="max-width: 600px; border: 1px solid #eee; border-radius: 4px; box-shadow: 0 1px 2px 0 rgba(0, 0, 0, 0.05);">
+</a><br>
+
+## 快速上手
+
+我們為快速使用模型提供了 `pipeline` API。 Pipeline 包含了預訓練模型和對應的文本預處理。下面是一個快速使用 pipeline 去判斷正負面情緒的例子：
+
+```python
+>>> from transformers import pipeline
+
+# 使用情緒分析 pipeline
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to introduce pipeline to the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9996980428695679}]
+```
+
+第二行程式碼下載並快取 pipeline 使用的預訓練模型，而第三行程式碼則在給定的文本上進行了評估。這裡的答案“正面” (positive) 具有 99.97% 的信賴度。
+
+許多的 NLP 任務都有隨選即用的預訓練 `pipeline`。例如，我們可以輕鬆地從給定文本中擷取問題答案：
+
+``` python
+>>> from transformers import pipeline
+
+# 使用問答 pipeline
+>>> question_answerer = pipeline('question-answering')
+>>> question_answerer({
+...     'question': 'What is the name of the repository ?',
+...     'context': 'Pipeline has been included in the huggingface/transformers repository'
+... })
+{'score': 0.30970096588134766, 'start': 34, 'end': 58, 'answer': 'huggingface/transformers'}
+
+```
+
+除了提供問題解答，預訓練模型還提供了對應的信賴度分數以及解答在 tokenized 後的文本中開始和結束的位置。你可以從[這個教學](https://huggingface.co/transformers/task_summary.html)了解更多 `pipeline` API支援的任務。
+
+要在你的任務中下載和使用任何預訓練模型很簡單，只需三行程式碼。這裡是 PyTorch 版的範例：
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = AutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+這裡是對應的 TensorFlow 程式碼：
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+Tokenizer 為所有的預訓練模型提供了預處理，並可以直接轉換單一字串（比如上面的例子）或串列 (list)。它會輸出一個的字典 (dict) 讓你可以在下游程式碼裡使用或直接藉由 `**` 運算式傳給模型。
+
+模型本身是一個常規的 [Pytorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) 或 [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model)（取決於你的後端），可依常規方式使用。 [這個教學](https://huggingface.co/transformers/training.html)解釋了如何將這樣的模型整合到一般的 PyTorch 或 TensorFlow 訓練迴圈中，或是如何使用我們的 `Trainer` API 在一個新的資料集上快速進行微調。
+
+## 為什麼要用 transformers？
+
+1. 便於使用的先進模型：
+    - NLU 和 NLG 上性能卓越
+    - 對教學和實作友好且低門檻
+    - 高度抽象，使用者只須學習 3 個類別
+    - 對所有模型使用的制式化API
+
+1. 更低的運算成本，更少的碳排放：
+    - 研究人員可以分享預訓練的模型而非從頭開始訓練
+    - 工程師可以減少計算時間以及生產成本
+    - 數十種模型架構、兩千多個預訓練模型、100多種語言支援
+
+1. 對於模型生命週期的每一個部分都面面俱到：
+    - 訓練先進的模型，只需 3 行程式碼
+    - 模型可以在不同深度學習框架之間任意轉換
+    - 為訓練、評估和生產選擇最適合的框架，並完美銜接
+
+1. 為你的需求輕鬆客製化專屬模型和範例：
+    - 我們為每種模型架構提供了多個範例來重現原論文結果
+    - 一致的模型內部架構
+    - 模型檔案可單獨使用，便於修改和快速實驗
+
+## 什麼情況下我不該用 transformers？
+
+- 本函式庫並不是模組化的神經網絡工具箱。模型文件中的程式碼並未做額外的抽象封裝，以便研究人員快速地翻閱及修改程式碼，而不會深陷複雜的類別包裝之中。
+- `Trainer` API 並非相容任何模型，它只為本函式庫中的模型最佳化。對於一般的機器學習用途，請使用其他函式庫。
+- 儘管我們已盡力而為，[examples 目錄](https://github.com/huggingface/transformers/tree/master/examples)中的腳本也僅為範例而已。對於特定問題，它們並不一定隨選即用，可能需要修改幾行程式碼以符合需求。
+
+## 安裝
+
+### 使用 pip
+
+這個 Repository 已在 Python 3.6+、Flax 0.3.2+、PyTorch 1.3.1+ 和 TensorFlow 2.3+ 下經過測試。
+
+你可以在[虛擬環境](https://docs.python.org/3/library/venv.html)中安裝 🤗 Transformers。如果你還不熟悉 Python 的虛擬環境，請閱此[使用者指引](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/)。
+
+首先，用你打算使用的版本的 Python 創建一個虛擬環境並進入。
+
+然後，你需要安裝 Flax、PyTorch 或 TensorFlow 其中之一。對於該如何在你使用的平台上安裝這些框架，請參閱 [TensorFlow 安裝頁面](https://www.tensorflow.org/install/), [PyTorch 安裝頁面](https://pytorch.org/get-started/locally/#start-locally) 或 [Flax 安裝頁面](https://github.com/google/flax#quick-install)。
+
+當其中一個後端安裝成功後，🤗 Transformers 可依此安裝：
+
+```bash
+pip install transformers
+```
+
+如果你想要試試範例或者想在正式發布前使用最新開發中的程式碼，你必須[從原始碼安裝](https://huggingface.co/transformers/installation.html#installing-from-source)。
+
+### 使用 conda
+
+自 Transformers 4.0.0 版始，我們有了一個 conda channel： `huggingface`。
+
+🤗 Transformers 可以藉由 conda 依此安裝：
+
+```shell script
+conda install -c huggingface transformers
+```
+
+要藉由 conda 安裝 Flax、PyTorch 或 TensorFlow 其中之一，請參閱它們各自安裝頁面的說明。
+
+## 模型架構
+
+**🤗 Transformers 支援的[所有的模型檢查點](https://huggingface.co/models)**，由[使用者](https://huggingface.co/users)和[組織](https://huggingface.co/organizations)上傳，均與 huggingface.co [model hub](https://huggingface.co) 完美結合。
+
+目前的檢查點數量： ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
+🤗 Transformers 目前支援以下的架構（模型概覽請參閱[這裡](https://huggingface.co/transformers/model_summary.html)）：
+
+1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
+1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[BigBird-RoBERTa](https://huggingface.co/transformers/model_doc/bigbird.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[BigBird-Pegasus](https://huggingface.co/transformers/model_doc/bigbird_pegasus.html)** (from Google Research) released with the paper [Big Bird: Transformers for Longer Sequences](https://arxiv.org/abs/2007.14062) by Manzil Zaheer, Guru Guruganesh, Avinava Dubey, Joshua Ainslie, Chris Alberti, Santiago Ontanon, Philip Pham, Anirudh Ravula, Qifan Wang, Li Yang, Amr Ahmed.
+1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BORT](https://huggingface.co/transformers/model_doc/bort.html)** (from Alexa) released with the paper [Optimal Subarchitecture Extraction For BERT](https://arxiv.org/abs/2010.10499) by Adrian de Wynter and Daniel J. Perry.
+1. **[ByT5](https://huggingface.co/transformers/model_doc/byt5.html)** (from Google Research) released with the paper [ByT5: Towards a token-free future with pre-trained byte-to-byte models](https://arxiv.org/abs/2105.13626) by Linting Xue, Aditya Barua, Noah Constant, Rami Al-Rfou, Sharan Narang, Mihir Kale, Adam Roberts, Colin Raffel.
+1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+1. **[CANINE](https://huggingface.co/transformers/model_doc/canine.html)** (from Google Research) released with the paper [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874) by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting.
+1. **[CLIP](https://huggingface.co/transformers/model_doc/clip.html)** (from OpenAI) released with the paper [Learning Transferable Visual Models From Natural Language Supervision](https://arxiv.org/abs/2103.00020) by Alec Radford, Jong Wook Kim, Chris Hallacy, Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen Krueger, Ilya Sutskever.
+1. **[ConvBERT](https://huggingface.co/transformers/model_doc/convbert.html)** (from YituTech) released with the paper [ConvBERT: Improving BERT with Span-based Dynamic Convolution](https://arxiv.org/abs/2008.02496) by Zihang Jiang, Weihao Yu, Daquan Zhou, Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
+1. **[CPM](https://huggingface.co/transformers/model_doc/cpm.html)** (from Tsinghua University) released with the paper [CPM: A Large-scale Generative Chinese Pre-trained Language Model](https://arxiv.org/abs/2012.00413) by Zhengyan Zhang, Xu Han, Hao Zhou, Pei Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng, Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang, Juanzi Li, Xiaoyan Zhu, Maosong Sun.
+1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DeBERTa-v2](https://huggingface.co/transformers/model_doc/deberta_v2.html)** (from Microsoft) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DeiT](https://huggingface.co/transformers/model_doc/deit.html)** (from Facebook) released with the paper [Training data-efficient image transformers & distillation through attention](https://arxiv.org/abs/2012.12877) by Hugo Touvron, Matthieu Cord, Matthijs Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
+1. **[DETR](https://huggingface.co/transformers/model_doc/detr.html)** (from Facebook) released with the paper [End-to-End Object Detection with Transformers](https://arxiv.org/abs/2005.12872) by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier, Alexander Kirillov, Sergey Zagoruyko.
+1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
+1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
+for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
+Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+1. **[GPT Neo](https://huggingface.co/transformers/model_doc/gpt_neo.html)** (from EleutherAI) released in the repository [EleutherAI/gpt-neo](https://github.com/EleutherAI/gpt-neo) by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
+1. **[Hubert](https://huggingface.co/transformers/model_doc/hubert.html)** (from Facebook) released with the paper [HuBERT: Self-Supervised Speech Representation Learning by Masked Prediction of Hidden Units](https://arxiv.org/abs/2106.07447) by Wei-Ning Hsu, Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
+1. **[I-BERT](https://huggingface.co/transformers/model_doc/ibert.html)** (from Berkeley) released with the paper [I-BERT: Integer-only BERT Quantization](https://arxiv.org/abs/2101.01321) by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
+1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+1. **[LUKE](https://huggingface.co/transformers/model_doc/luke.html)** (from Studio Ousia) released with the paper [LUKE: Deep Contextualized Entity Representations with Entity-aware Self-attention](https://arxiv.org/abs/2010.01057) by Ikuya Yamada, Akari Asai, Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
+1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
+1. **[M2M100](https://huggingface.co/transformers/model_doc/m2m_100.html)** (from Facebook) released with the paper [Beyond English-Centric Multilingual Machine Translation](https://arxiv.org/abs/2010.11125) by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
+1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
+1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+1. **[MBart-50](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Translation with Extensible Multilingual Pretraining and Finetuning](https://arxiv.org/abs/2008.00401) by Yuqing Tang, Chau Tran, Xian Li, Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
+1. **[Megatron-BERT](https://huggingface.co/transformers/model_doc/megatron_bert.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+1. **[Megatron-GPT2](https://huggingface.co/transformers/model_doc/megatron_gpt2.html)** (from NVIDIA) released with the paper [Megatron-LM: Training Multi-Billion Parameter Language Models Using Model Parallelism](https://arxiv.org/abs/1909.08053) by Mohammad Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
+1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
+1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+1. **[RoFormer](https://huggingface.co/transformers/model_doc/roformer.html)** (from ZhuiyiTechnology), released together with the paper a [RoFormer: Enhanced Transformer with Rotary Position Embedding](https://arxiv.org/pdf/2104.09864v1.pdf) by Jianlin Su and Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
+1. **[SpeechToTextTransformer](https://huggingface.co/transformers/model_doc/speech_to_text.html)** (from Facebook), released together with the paper [fairseq S2T: Fast Speech-to-Text Modeling with fairseq](https://arxiv.org/abs/2010.05171) by Changhan Wang, Yun Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
+1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+1. **[Vision Transformer (ViT)](https://huggingface.co/transformers/model_doc/vit.html)** (from Google AI) released with the paper [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929) by Alexey Dosovitskiy, Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
+1. **[VisualBERT](https://huggingface.co/transformers/model_doc/visual_bert.html)** (from UCLA NLP) released with the paper [VisualBERT: A Simple and Performant Baseline for Vision and Language](https://arxiv.org/pdf/1908.03557) by Liunian Harold Li, Mark Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
+1. **[Wav2Vec2](https://huggingface.co/transformers/model_doc/wav2vec2.html)** (from Facebook AI) released with the paper [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael Auli.
+1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+1. **[XLSR-Wav2Vec2](https://huggingface.co/transformers/model_doc/xlsr_wav2vec2.html)** (from Facebook AI) released with the paper [Unsupervised Cross-Lingual Representation Learning For Speech Recognition](https://arxiv.org/abs/2006.13979) by Alexis Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.
+1. 想要貢獻新的模型？我們這裡有一份**詳細指引和模板**來引導你加入新的模型。你可以在 [`templates`](./templates) 目錄中找到它們。記得查看[貢獻指引](./CONTRIBUTING.md)並在開始寫 PR 前聯繫維護人員或開一個新的 issue 來獲得 feedbacks。
+
+要檢查某個模型是否已有 Flax、PyTorch 或 TensorFlow 的實作，或其是否在🤗 Tokenizers 函式庫中有對應的 tokenizer，敬請參閱[此表](https://huggingface.co/transformers/index.html#supported-frameworks)。
+
+這些實作均已於多個資料集測試（請參閱範例腳本）並應與原版實作表現相當。你可以在範例文件的[此節](https://huggingface.co/transformers/examples.html)中了解實作的細節。
+
+
+## 了解更多
+
+| 章節 | 描述 |
+|-|-|
+| [文件](https://huggingface.co/transformers/) | 完整的 API 文件和教學 |
+| [任務概覽](https://huggingface.co/transformers/task_summary.html) | 🤗 Transformers 支援的任務 |
+| [預處理教學](https://huggingface.co/transformers/preprocessing.html) | 使用 `Tokenizer` 來為模型準備資料 |
+| [訓練和微調](https://huggingface.co/transformers/training.html) | 使用 PyTorch/TensorFlow 的內建的訓練方式或於 `Trainer` API 中使用 🤗 Transformers 提供的模型 |
+| [快速上手：微調和範例腳本](https://github.com/huggingface/transformers/tree/master/examples) | 為各種任務提供的範例腳本 |
+| [模型分享和上傳](https://huggingface.co/transformers/model_sharing.html) | 上傳並與社群分享你微調的模型 |
+| [遷移](https://huggingface.co/transformers/migration.html) | 從 `pytorch-transformers` 或 `pytorch-pretrained-bert` 遷移到 🤗 Transformers |
+
+## 引用
+
+我們已將此函式庫的[論文](https://www.aclweb.org/anthology/2020.emnlp-demos.6/)正式發表。如果你使用了 🤗 Transformers 函式庫，可以引用：
+```bibtex
+@inproceedings{wolf-etal-2020-transformers,
+    title = "Transformers: State-of-the-Art Natural Language Processing",
+    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = oct,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+    pages = "38--45"
+}
+```
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,10 +1,12 @@
 // These two things need to be updated at each release for the version selector.
 // Last stable version
-const stableVersion = "v4.7.0"
+const stableVersion = "v4.9.0"
 // Dictionary doc folder to label. The last stable version should have an empty key.
 const versionMapping = {
    "master": "master",
-    "": "v4.7.0 (stable)",
+    "": "v4.9.0 (stable)",
+    "v4.8.2": "v4.8.0/v4.8.1/v4.8.2",
+    "v4.7.0": "v4.7.0",
    "v4.6.0": "v4.6.0",
    "v4.5.1": "v4.5.0/v4.5.1",
    "v4.4.2": "v4.4.0/v4.4.1/v4.4.2",
--- a/docs/source/debugging.rst
+++ b/docs/source/debugging.rst
@@ -24,7 +24,11 @@ Underflow and Overflow Detection

 .. note::

-   This feature can be used with any ``nn.Module``-based model
+   For multi-GPU training it requires DDP (``torch.distributed.launch``).
+
+.. note::
+
+   This feature can be used with any ``nn.Module``-based model.

 If you start getting ``loss=NaN`` or the model inhibits some other abnormal behavior due to ``inf`` or ``nan`` in
 activations or weights one needs to discover where the first underflow or overflow happens and what led to it. Luckily
--- a/docs/source/imgs/parallelism-deepspeed-3d.png
+++ b/docs/source/imgs/parallelism-deepspeed-3d.png
--- a/docs/source/imgs/parallelism-flexflow.jpeg
+++ b/docs/source/imgs/parallelism-flexflow.jpeg
--- a/docs/source/imgs/parallelism-gpipe-bubble.png
+++ b/docs/source/imgs/parallelism-gpipe-bubble.png
--- a/docs/source/imgs/parallelism-sagemaker-interleaved-pipeline.png
+++ b/docs/source/imgs/parallelism-sagemaker-interleaved-pipeline.png
--- a/docs/source/imgs/parallelism-tp-independent-gelu.png
+++ b/docs/source/imgs/parallelism-tp-independent-gelu.png
--- a/docs/source/imgs/parallelism-tp-parallel_gemm.png
+++ b/docs/source/imgs/parallelism-tp-parallel_gemm.png
--- a/docs/source/imgs/parallelism-tp-parallel_self_attention.png
+++ b/docs/source/imgs/parallelism-tp-parallel_self_attention.png
--- a/docs/source/imgs/parallelism-tp-parallel_shard_processing.png
+++ b/docs/source/imgs/parallelism-tp-parallel_shard_processing.png
--- a/docs/source/imgs/parallelism-zero-dp-pp.png
+++ b/docs/source/imgs/parallelism-zero-dp-pp.png
--- a/docs/source/imgs/parallelism-zero.png
+++ b/docs/source/imgs/parallelism-zero.png
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -131,158 +131,161 @@ Supported models
 12. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
    French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
    Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-13. :doc:`CLIP <model_doc/clip>` from (OpenAI) released with the paper `Learning Transferable Visual Models From
+13. :doc:`CANINE <model_doc/canine>` (from Google Research) released with the paper `CANINE: Pre-training an Efficient
+    Tokenization-Free Encoder for Language Representation <https://arxiv.org/abs/2103.06874>`__ by Jonathan H. Clark,
+    Dan Garrette, Iulia Turc, John Wieting.
+14. :doc:`CLIP <model_doc/clip>` (from OpenAI) released with the paper `Learning Transferable Visual Models From
    Natural Language Supervision <https://arxiv.org/abs/2103.00020>`__ by Alec Radford, Jong Wook Kim, Chris Hallacy,
    Aditya Ramesh, Gabriel Goh, Sandhini Agarwal, Girish Sastry, Amanda Askell, Pamela Mishkin, Jack Clark, Gretchen
    Krueger, Ilya Sutskever.
-14. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
+15. :doc:`ConvBERT <model_doc/convbert>` (from YituTech) released with the paper `ConvBERT: Improving BERT with
    Span-based Dynamic Convolution <https://arxiv.org/abs/2008.02496>`__ by Zihang Jiang, Weihao Yu, Daquan Zhou,
    Yunpeng Chen, Jiashi Feng, Shuicheng Yan.
-15. :doc:`CPM <model_doc/cpm>` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative
+16. :doc:`CPM <model_doc/cpm>` (from Tsinghua University) released with the paper `CPM: A Large-scale Generative
    Chinese Pre-trained Language Model <https://arxiv.org/abs/2012.00413>`__ by Zhengyan Zhang, Xu Han, Hao Zhou, Pei
    Ke, Yuxian Gu, Deming Ye, Yujia Qin, Yusheng Su, Haozhe Ji, Jian Guan, Fanchao Qi, Xiaozhi Wang, Yanan Zheng,
    Guoyang Zeng, Huanqi Cao, Shengqi Chen, Daixuan Li, Zhenbo Sun, Zhiyuan Liu, Minlie Huang, Wentao Han, Jie Tang,
    Juanzi Li, Xiaoyan Zhu, Maosong Sun.
-16. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
+17. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
    Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
    Lav R. Varshney, Caiming Xiong and Richard Socher.
-17. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
+18. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT with
    Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu
    Chen.
-18. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
+19. :doc:`DeBERTa-v2 <model_doc/deberta_v2>` (from Microsoft) released with the paper `DeBERTa: Decoding-enhanced BERT
    with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
    Weizhu Chen.
-19. :doc:`DeiT <model_doc/deit>` (from Facebook) released with the paper `Training data-efficient image transformers &
+20. :doc:`DeiT <model_doc/deit>` (from Facebook) released with the paper `Training data-efficient image transformers &
    distillation through attention <https://arxiv.org/abs/2012.12877>`__ by Hugo Touvron, Matthieu Cord, Matthijs
    Douze, Francisco Massa, Alexandre Sablayrolles, Hervé Jégou.
-20. :doc:`DETR <model_doc/detr>` (from Facebook) released with the paper `End-to-End Object Detection with Transformers
+21. :doc:`DETR <model_doc/detr>` (from Facebook) released with the paper `End-to-End Object Detection with Transformers
    <https://arxiv.org/abs/2005.12872>`__ by Nicolas Carion, Francisco Massa, Gabriel Synnaeve, Nicolas Usunier,
    Alexander Kirillov, Sergey Zagoruyko.
-21. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+22. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
    Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
    Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-22. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+23. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
    distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
    Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
    `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
    version of DistilBERT.
-23. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+24. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
    Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
    Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-24. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+25. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
    Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
    Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-25. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+26. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
    Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
    Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-26. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+27. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
    Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
    Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-27. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+28. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
    Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
    and Ilya Sutskever.
-28. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+29. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
    Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
    Luan, Dario Amodei** and Ilya Sutskever**.
-29. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
+30. :doc:`GPT Neo <model_doc/gpt_neo>` (from EleutherAI) released in the repository `EleutherAI/gpt-neo
    <https://github.com/EleutherAI/gpt-neo>`__ by Sid Black, Stella Biderman, Leo Gao, Phil Wang and Connor Leahy.
-30. :doc:`Hubert <model_doc/hubert>` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech
+31. :doc:`Hubert <model_doc/hubert>` (from Facebook) released with the paper `HuBERT: Self-Supervised Speech
    Representation Learning by Masked Prediction of Hidden Units <https://arxiv.org/abs/2106.07447>`__ by Wei-Ning Hsu,
    Benjamin Bolte, Yao-Hung Hubert Tsai, Kushal Lakhotia, Ruslan Salakhutdinov, Abdelrahman Mohamed.
-31. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
+32. :doc:`I-BERT <model_doc/ibert>` (from Berkeley) released with the paper `I-BERT: Integer-only BERT Quantization
    <https://arxiv.org/abs/2101.01321>`__ by Sehoon Kim, Amir Gholami, Zhewei Yao, Michael W. Mahoney, Kurt Keutzer
-32. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+33. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
    of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
    Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-33. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
+34. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
    <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-34. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+35. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
    Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-35. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
+36. :doc:`LUKE <model_doc/luke>` (from Studio Ousia) released with the paper `LUKE: Deep Contextualized Entity
    Representations with Entity-aware Self-attention <https://arxiv.org/abs/2010.01057>`__ by Ikuya Yamada, Akari Asai,
    Hiroyuki Shindo, Hideaki Takeda, Yuji Matsumoto.
-36. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+37. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
    Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
    by Hao Tan and Mohit Bansal.
-37. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
+38. :doc:`M2M100 <model_doc/m2m_100>` (from Facebook) released with the paper `Beyond English-Centric Multilingual
    Machine Translation <https://arxiv.org/abs/2010.11125>`__ by by Angela Fan, Shruti Bhosale, Holger Schwenk, Zhiyi
    Ma, Ahmed El-Kishky, Siddharth Goyal, Mandeep Baines, Onur Celebi, Guillaume Wenzek, Vishrav Chaudhary, Naman
    Goyal, Tom Birch, Vitaliy Liptchinsky, Sergey Edunov, Edouard Grave, Michael Auli, Armand Joulin.
-38. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+39. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
    Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
    Translator Team.
-39. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+40. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
    Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
    Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-40. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
+41. :doc:`MBart-50 <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Translation with Extensible
    Multilingual Pretraining and Finetuning <https://arxiv.org/abs/2008.00401>`__ by Yuqing Tang, Chau Tran, Xian Li,
    Peng-Jen Chen, Naman Goyal, Vishrav Chaudhary, Jiatao Gu, Angela Fan.
-41. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
+42. :doc:`Megatron-BERT <model_doc/megatron_bert>` (from NVIDIA) released with the paper `Megatron-LM: Training
    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-42. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
+43. :doc:`Megatron-GPT2 <model_doc/megatron_gpt2>` (from NVIDIA) released with the paper `Megatron-LM: Training
    Multi-Billion Parameter Language Models Using Model Parallelism <https://arxiv.org/abs/1909.08053>`__ by Mohammad
    Shoeybi, Mostofa Patwary, Raul Puri, Patrick LeGresley, Jared Casper and Bryan Catanzaro.
-43. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+44. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
    Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
    Jianfeng Lu, Tie-Yan Liu.
-44. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+45. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
    text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
    Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
-45. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+46. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__> by Jingqing Zhang, Yao Zhao,
    Mohammad Saleh and Peter J. Liu.
-46. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+47. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
    Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
    Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-47. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+48. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
    Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-48. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+49. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
    Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-49. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper a `RoFormer:
+50. :doc:`RoFormer <model_doc/roformer>` (from ZhuiyiTechnology), released together with the paper a `RoFormer:
    Enhanced Transformer with Rotary Position Embedding <https://arxiv.org/pdf/2104.09864v1.pdf>`__ by Jianlin Su and
    Yu Lu and Shengfeng Pan and Bo Wen and Yunfeng Liu.
-50. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
+51. :doc:`SpeechToTextTransformer <model_doc/speech_to_text>` (from Facebook), released together with the paper
    `fairseq S2T: Fast Speech-to-Text Modeling with fairseq <https://arxiv.org/abs/2010.05171>`__ by Changhan Wang, Yun
    Tang, Xutai Ma, Anne Wu, Dmytro Okhonko, Juan Pino.
-51. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
+52. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
    about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
    Krishna, and Kurt W. Keutzer.
-52. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+53. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
    Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
    Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-53. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
+54. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
    Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
    Francesco Piccinno and Julian Martin Eisenschlos.
-54. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+55. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
    Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
    Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-55. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
+56. :doc:`Vision Transformer (ViT) <model_doc/vit>` (from Google AI) released with the paper `An Image is Worth 16x16
    Words: Transformers for Image Recognition at Scale <https://arxiv.org/abs/2010.11929>`__ by Alexey Dosovitskiy,
    Lucas Beyer, Alexander Kolesnikov, Dirk Weissenborn, Xiaohua Zhai, Thomas Unterthiner, Mostafa Dehghani, Matthias
    Minderer, Georg Heigold, Sylvain Gelly, Jakob Uszkoreit, Neil Houlsby.
-56. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
+57. :doc:`VisualBERT <model_doc/visual_bert>` (from UCLA NLP) released with the paper `VisualBERT: A Simple and
    Performant Baseline for Vision and Language <https://arxiv.org/pdf/1908.03557>`__ by Liunian Harold Li, Mark
    Yatskar, Da Yin, Cho-Jui Hsieh, Kai-Wei Chang.
-57. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
+58. :doc:`Wav2Vec2 <model_doc/wav2vec2>` (from Facebook AI) released with the paper `wav2vec 2.0: A Framework for
    Self-Supervised Learning of Speech Representations <https://arxiv.org/abs/2006.11477>`__ by Alexei Baevski, Henry
    Zhou, Abdelrahman Mohamed, Michael Auli.
-58. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+59. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
    Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-59. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+60. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
    Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
    Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-60. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+61. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
    Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
    Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
    Zettlemoyer and Veselin Stoyanov.
-61. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+62. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
    Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
    Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-62. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
+63. :doc:`XLSR-Wav2Vec2 <model_doc/xlsr_wav2vec2>` (from Facebook AI) released with the paper `Unsupervised
    Cross-Lingual Representation Learning For Speech Recognition <https://arxiv.org/abs/2006.13979>`__ by Alexis
    Conneau, Alexei Baevski, Ronan Collobert, Abdelrahman Mohamed, Michael Auli.

@@ -324,6 +327,8 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          CamemBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           Canine            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |          ConvBERT           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            DETR             |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
@@ -348,9 +353,9 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |     Funnel Transformer      |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           GPT Neo           |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|           GPT Neo           |       ❌       |       ❌       |       ✅        |         ❌         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           Hubert            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+|           Hubert            |       ❌       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |           I-BERT            |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -368,7 +373,7 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+|           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |        MegatronBert         |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -406,7 +411,7 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |         VisualBert          |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|          Wav2Vec2           |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+|          Wav2Vec2           |       ✅       |       ❌       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             XLM             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -416,7 +421,7 @@ Flax), PyTorch, and/or TensorFlow.
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |            XLNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-|            mBART            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+|            mBART            |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
 |             mT5             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
 +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
@@ -459,6 +464,7 @@ Flax), PyTorch, and/or TensorFlow.
    add_new_model
    fast_tokenizers
    performance
+    parallelism
    testing
    debugging
    serialization
@@ -508,6 +514,7 @@ Flax), PyTorch, and/or TensorFlow.
    model_doc/bort
    model_doc/byt5
    model_doc/camembert
+    model_doc/canine
    model_doc/clip
    model_doc/convbert
    model_doc/cpm
--- a/docs/source/internal/file_utils.rst
+++ b/docs/source/internal/file_utils.rst
@@ -51,4 +51,4 @@ Special Properties
 Other Utilities
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.file_utils._BaseLazyModule
+.. autoclass:: transformers.file_utils._LazyModule
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -22,4 +22,5 @@ PretrainedConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PretrainedConfig
+    :special-members: push_to_hub
    :members:
--- a/docs/source/main_classes/deepspeed.rst
+++ b/docs/source/main_classes/deepspeed.rst
@@ -73,8 +73,6 @@ or via ``transformers``' ``extras``:

    pip install transformers[deepspeed]

-(will become available starting from ``transformers==4.6.0``)
-
 or find more details on `the DeepSpeed's GitHub page <https://github.com/microsoft/deepspeed#installation>`__ and
 `advanced install <https://www.deepspeed.ai/tutorials/advanced-install/>`__.

@@ -90,20 +88,31 @@ To make a local build for DeepSpeed:
    git clone https://github.com/microsoft/DeepSpeed/
    cd DeepSpeed
    rm -rf build
-    TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 pip install . \
+    TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 pip install . \
    --global-option="build_ext" --global-option="-j8" --no-cache -v \
    --disable-pip-version-check 2>&1 | tee build.log

-Edit ``TORCH_CUDA_ARCH_LIST`` to insert the code for the architectures of the GPU cards you intend to use.
+If you intend to use NVMe offload you will need to also include ``DS_BUILD_AIO=1`` in the instructions above (and also
+install `libaio-dev` system-wide).

-Or if you need to use the same setup on multiple machines, make a binary wheel:
+Edit ``TORCH_CUDA_ARCH_LIST`` to insert the code for the architectures of the GPU cards you intend to use. Assuming all
+your cards are the same you can get the arch via:
+
+.. code-block:: bash
+
+    CUDA_VISIBLE_DEVICES=0 python -c "import torch; print(torch.cuda.get_device_capability())"
+
+So if you get ``8, 6``, then use ``TORCH_CUDA_ARCH_LIST="8.6"``. If you have multiple different cards, you can list all
+of them like so ``TORCH_CUDA_ARCH_LIST="6.1;8.6"``
+
+If you need to use the same setup on multiple machines, make a binary wheel:

 .. code-block:: bash

    git clone https://github.com/microsoft/DeepSpeed/
    cd DeepSpeed
    rm -rf build
-    TORCH_CUDA_ARCH_LIST="6.1;8.6" DS_BUILD_OPS=1 \
+    TORCH_CUDA_ARCH_LIST="8.6" DS_BUILD_CPU_ADAM=1 DS_BUILD_UTILS=1 \
    python setup.py build_ext -j8 bdist_wheel

 it will generate something like ``dist/deepspeed-0.3.13+8cd046f-cp38-cp38-linux_x86_64.whl`` which now you can install
@@ -692,7 +701,17 @@ be ignored.

 - ``sub_group_size``: ``1e9``

-This one does impact GPU memory usage. But no docs at the moment on Deepspeed side to explain the tuning.
+``sub_group_size`` controls the granularity in which parameters are updated during optimizer steps. Parameters are
+grouped into buckets of ``sub_group_size`` and each buckets is updated one at a time. When used with NVMe offload in
+ZeRO-Infinity, ``sub_group_size`` therefore controls the granularity in which model states are moved in and out of CPU
+memory from NVMe during the optimizer step. This prevents running out of CPU memory for extremely large models.
+
+You can leave ``sub_group_size`` to its default value of `1e9` when not using NVMe offload. You may want to change its
+default value in the following cases:
+
+1. Running into OOM during optimizer step: Reduce ``sub_group_size`` to reduce memory utilization of temporary buffers
+2. Optimizer Step is taking a long time: Increase ``sub_group_size`` to improve bandwidth utilization as a result of
+   the increased data buffers.


 .. _deepspeed-nvme:
@@ -1042,7 +1061,8 @@ optimizers, with the exception of using the combination of HuggingFace scheduler
 | DS Optimizer | No           | Yes          |
 +--------------+--------------+--------------+

-If ``offload_optimizer`` is enabled you must use both DeepSpeed scheduler and DeepSpeed optimizer.
+It is possible to use a non-DeepSpeed optimizer when ``offload_optimizer`` is enabled, as long as it has both CPU and
+GPU implementation (except LAMB).



@@ -1136,8 +1156,8 @@ Here is where the schedulers overlap between 🤗 Transformers and DeepSpeed:
  therefore, if you don't configure the scheduler this is scheduler that will get configured by default.

 If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use
-the values of ``--lr_scheduler_type``, ``--learning_rate`` and ``--warmup_steps`` to configure a 🤗 Transformers version
-of it.
+the values of ``--lr_scheduler_type``, ``--learning_rate`` and ``--warmup_steps`` or ``--warmup_ratio`` to configure a
+🤗 Transformers version of it.

 Here is an example of the auto-configured ``scheduler`` entry for ``WarmupLR``:

@@ -1158,9 +1178,10 @@ Since `"auto"` is used the :class:`~transformers.Trainer` arguments will set the
 file. This is so that there is one definitive source of the values and to avoid hard to find errors when, for example,
 the learning rate is set to different values in different places. Command line rules. The values that get set are:

- ``warmup_min_lr`` with the value of ``0``
- ``warmup_max_lr`` with the value of ``--learning_rate``
- ``warmup_num_steps`` with the value of ``--warmup_steps``
+- ``warmup_min_lr`` with the value of ``0``.
+- ``warmup_max_lr`` with the value of ``--learning_rate``.
+- ``warmup_num_steps`` with the value of ``--warmup_steps`` if provided. Otherwise will use ``--warmup_ratio``
+  multiplied by the number of training steps and rounded up.
 - ``total_num_steps`` with either the value of ``--max_steps`` or if it is not provided, derived automatically at run
  time based on the environment and the size of the dataset and other command line arguments (needed for
  ``WarmupDecayLR``).
@@ -1437,8 +1458,56 @@ won't be possible to load it back.

 While the fp16 weights are fine for resuming training, if you finished finetuning your model and want to upload it to
 the `models hub <https://huggingface.co/models>`__ or pass it to someone else you most likely will want to get the fp32
-weights. This cannot be done during training since this is a process that requires a lot of memory, and therefore this
-is performed offline.
+weights. This ideally shouldn't be done during training since this is a process that requires a lot of memory, and
+therefore best to be performed offline after the training is complete. But if desired and you have plenty of free CPU
+memory it can be done in the same training script. The following sections will discuss both approaches.
+
+
+**Live FP32 Weights Recovery:**
+
+This approach may not work if you model is large and you have little free CPU memory left, at the end of the training.
+
+If you have saved at least one checkpoint, and you want to use the latest one, you can do the following:
+
+.. code-block:: python
+
+    from transformers.trainer_utils import get_last_checkpoint
+    from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+    checkpoint_dir = get_last_checkpoint(trainer.args.output_dir)
+    fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+
+If you're using the ``--load_best_model_at_end`` class:`~transformers.TrainingArguments` argument (to track the best
+checkpoint), then you can finish the training by first saving the final model explicitly and then do the same as above:
+
+.. code-block:: python
+
+    from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint
+    checkpoint_dir = os.path.join(trainer.args.output_dir, "checkpoint-final")
+    trainer.deepspeed.save_checkpoint(checkpoint_dir)
+    fp32_model = load_state_dict_from_zero_checkpoint(trainer.model, checkpoint_dir)
+
+.. note::
+
+    Note, that once ``load_state_dict_from_zero_checkpoint`` was run, the ``model`` will no longer be useable in the
+    DeepSpeed context of the same application. i.e. you will need to re-initialize the deepspeed engine, since
+    ``model.load_state_dict(state_dict)`` will remove all the DeepSpeed magic from it. So do this only at the very end
+    of the training.
+
+Of course, you don't have to use class:`~transformers.Trainer` and you can adjust the examples above to your own
+trainer.
+
+If for some reason you want more refinement, you can also extract the fp32 ``state_dict`` of the weights and apply
+these yourself as is shown in the following example:
+
+.. code-block:: python
+
+    from deepspeed.utils.zero_to_fp32 import get_fp32_state_dict_from_zero_checkpoint
+    state_dict = get_fp32_state_dict_from_zero_checkpoint(checkpoint_dir) # already on cpu
+    model = model.cpu()
+    model.load_state_dict(state_dict)
+
+
+**Offline FP32 Weights Recovery:**

 DeepSpeed creates a special conversion script ``zero_to_fp32.py`` which it places in the top-level of the checkpoint
 folder. Using this script you can extract the weights at any point. The script is standalone and you no longer need to
@@ -1467,15 +1536,16 @@ weights just run:

 .. code-block:: bash

-    python zero_to_fp32.py global_step1 pytorch_model.bin
+    python zero_to_fp32.py . pytorch_model.bin

-The script will automatically handle either ZeRO-2 or ZeRO-3 checkpoint.
+This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights consolidated from multiple GPUs.
+
+The script will automatically be able to handle either a ZeRO-2 or ZeRO-3 checkpoint.

 ``python zero_to_fp32.py -h`` will give you usage details.

-If you have multiple DeepSpeed checkpoint sub-folders, pick the one you know to have the desired weights.
-
-This is it. ``pytorch_model.bin`` will now contain the full fp32 model weights consolidated from multiple GPUs.
+The script will auto-discover the deepspeed sub-folder using the contents of the file ``latest``, which in the current
+example will contain ``global_step1``.

 Note: currently the script requires 2x general RAM of the final fp32 model weights.

@@ -1530,6 +1600,8 @@ Note: If the fp16 weights of the model can't fit onto the memory of a single GPU
 For full details on this method and other related features please refer to `Constructing Massive Models
 <https://deepspeed.readthedocs.io/en/latest/zero3.html#constructing-massive-models>`__.

+Also when loading fp16-pretrained models, you will want to tell ``from_pretrained`` to use
+``torch_dtype=torch.float16``. For details, please, see :ref:`from_pretrained-torch-dtype`.


 Gathering Parameters
@@ -1555,6 +1627,56 @@ stress on ``tensor([1.])``, or if you get an error where it says the parameter i
 larger multi-dimensional shape, this means that the parameter is partitioned and what you see is a ZeRO-3 placeholder.


+
+
+Filing Issues
+=======================================================================================================================
+
+Here is how to file an issue so that we could quickly get to the bottom of the issue and help you to unblock your work.
+
+In your report please always include:
+
+1. the full Deepspeed config file in the report
+
+2. either the command line arguments if you were using the :class:`~transformers.Trainer` or
+   :class:`~transformers.TrainingArguments` arguments if you were scripting the Trainer setup yourself. Please do not
+   dump the :class:`~transformers.TrainingArguments` as it has dozens of entries that are irrelevant.
+
+3. Output of:
+
+.. code-block:: bash
+
+    python -c 'import torch; print(f"torch: {torch.__version__}")'
+    python -c 'import transformers; print(f"transformers: {transformers.__version__}")'
+    python -c 'import deepspeed; print(f"deepspeed: {deepspeed.__version__}")'
+
+4. If possible include a link to a Google Colab notebook that we can reproduce the problem with. You can use this
+   `notebook <https://github.com/stas00/porting/blob/master/transformers/deepspeed/DeepSpeed_on_colab_CLI.ipynb>`__ as
+   a starting point.
+
+5. Unless it's impossible please always use a standard dataset that we can use and not something custom.
+
+6. If possible try to use one of the existing `examples
+   <https://github.com/huggingface/transformers/tree/master/examples/pytorch>`__ to reproduce the problem with.
+
+Things to consider:
+
+* Deepspeed is often not the cause of the problem.
+
+    Some of the filed issues proved to be Deepspeed-unrelated. That is once Deepspeed was removed from the setup, the
+    problem was still there.
+
+    Therefore, if it's not absolutely obvious it's a DeepSpeed-related problem, as in you can see that there is an
+    exception and you can see that DeepSpeed modules are involved, first re-test your setup without DeepSpeed in it.
+    And only if the problem persists then do mentioned Deepspeed and supply all the required details.
+
+* If it's clear to you that the issue is in the DeepSpeed core and not the integration part, please file the Issue
+  directly with `Deepspeed <https://github.com/microsoft/DeepSpeed/>`__. If you aren't sure, please do not worry,
+  either Issue tracker will do, we will figure it out once you posted it and redirect you to another Issue tracker if
+  need be.
+
+
+
 Troubleshooting
 =======================================================================================================================

--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -1,4 +1,4 @@
-.. 
+..
    Copyright 2020 The HuggingFace Team. All rights reserved.

    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
@@ -35,9 +35,41 @@ PreTrainedModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PreTrainedModel
+    :special-members: push_to_hub
    :members:


+.. _from_pretrained-torch-dtype:
+
+Model Instantiation dtype
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Under Pytorch a model normally gets instantiated with ``torch.float32`` format. This can be an issue if one tries to
+load a model whose weights are in fp16, since it'd require twice as much memory. To overcome this limitation, you can
+either explicitly pass the desired ``dtype`` using ``torch_dtype`` argument:
+
+.. code-block:: python
+
+    model = T5ForConditionalGeneration.from_pretrained("t5", torch_dtype=torch.float16)
+
+or, if you want the model to always load in the most optimal memory pattern, you can use the special value ``"auto"``,
+and then ``dtype`` will be automatically derived from the model's weights:
+
+.. code-block:: python
+
+    model = T5ForConditionalGeneration.from_pretrained("t5", torch_dtype="auto")
+
+Models instantiated from scratch can also be told which ``dtype`` to use with:
+
+.. code-block:: python
+
+    config = T5Config.from_pretrained("t5")
+    model = AutoModel.from_config(config)
+
+Due to Pytorch design, this functionality is only available for floating dtypes.
+
+
+
 ModuleUtilsMixin
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -49,6 +81,7 @@ TFPreTrainedModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFPreTrainedModel
+    :special-members: push_to_hub
    :members:


@@ -63,6 +96,7 @@ FlaxPreTrainedModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.FlaxPreTrainedModel
+    :special-members: push_to_hub
    :members:


--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -53,10 +53,8 @@ PreTrainedTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PreTrainedTokenizer
-    :special-members: __call__
-    :members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode, 
-        get_added_vocab, get_special_tokens_mask, num_special_tokens_to_add, prepare_for_tokenization, tokenize,
-        vocab_size
+    :special-members: __call__, batch_decode, decode, encode, push_to_hub
+    :members: 


 PreTrainedTokenizerFast
@@ -68,10 +66,8 @@ loaded very simply into 🤗 transformers. Take a look at the :doc:`Using tokeni
 <../fast_tokenizers>` page to understand how this is done.

 .. autoclass:: transformers.PreTrainedTokenizerFast
-    :special-members: __call__
-    :members: batch_decode, convert_ids_to_tokens, convert_tokens_to_ids, convert_tokens_to_string, decode, encode, 
-        get_added_vocab, get_special_tokens_mask, num_special_tokens_to_add,
-        set_truncation_and_padding,tokenize, vocab_size
+    :special-members: __call__, batch_decode, decode, encode, push_to_hub
+    :members:


 BatchEncoding
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -147,7 +147,7 @@ Here is an example of how this can be used in an application:

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
--- a/docs/source/model_doc/canine.rst
+++ b/docs/source/model_doc/canine.rst
@@ -0,0 +1,155 @@
+.. 
+    Copyright 2021 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+CANINE
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The CANINE model was proposed in `CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language
+Representation <https://arxiv.org/abs/2103.06874>`__ by Jonathan H. Clark, Dan Garrette, Iulia Turc, John Wieting. It's
+among the first papers that trains a Transformer without using an explicit tokenization step (such as Byte Pair
+Encoding (BPE), WordPiece or SentencePiece). Instead, the model is trained directly at a Unicode character-level.
+Training at a character-level inevitably comes with a longer sequence length, which CANINE solves with an efficient
+downsampling strategy, before applying a deep Transformer encoder.
+
+The abstract from the paper is the following:
+
+*Pipelined NLP systems have largely been superseded by end-to-end neural modeling, yet nearly all commonly-used models
+still require an explicit tokenization step. While recent tokenization approaches based on data-derived subword
+lexicons are less brittle than manually engineered tokenizers, these techniques are not equally suited to all
+languages, and the use of any fixed vocabulary may limit a model's ability to adapt. In this paper, we present CANINE,
+a neural encoder that operates directly on character sequences, without explicit tokenization or vocabulary, and a
+pre-training strategy that operates either directly on characters or optionally uses subwords as a soft inductive bias.
+To use its finer-grained input effectively and efficiently, CANINE combines downsampling, which reduces the input
+sequence length, with a deep transformer stack, which encodes context. CANINE outperforms a comparable mBERT model by
+2.8 F1 on TyDi QA, a challenging multilingual benchmark, despite having 28% fewer model parameters.*
+
+Tips:
+
+- CANINE uses no less than 3 Transformer encoders internally: 2 "shallow" encoders (which only consist of a single
+  layer) and 1 "deep" encoder (which is a regular BERT encoder). First, a "shallow" encoder is used to contextualize
+  the character embeddings, using local attention. Next, after downsampling, a "deep" encoder is applied. Finally,
+  after upsampling, a "shallow" encoder is used to create the final character embeddings. Details regarding up- and
+  downsampling can be found in the paper.
+- CANINE uses a max sequence length of 2048 characters by default. One can use :class:`~transformers.CanineTokenizer`
+  to prepare text for the model.
+- Classification can be done by placing a linear layer on top of the final hidden state of the special [CLS] token
+  (which has a predefined Unicode code point). For token classification tasks however, the downsampled sequence of
+  tokens needs to be upsampled again to match the length of the original character sequence (which is 2048). The
+  details for this can be found in the paper.
+-  Models:
+
+      - `google/canine-c <https://huggingface.co/google/canine-c>`__: Pre-trained with autoregressive character loss,
+        12-layer, 768-hidden, 12-heads, 121M parameters (size ~500 MB).
+      - `google/canine-s <https://huggingface.co/google/canine-s>`__: Pre-trained with subword loss, 12-layer,
+        768-hidden, 12-heads, 121M parameters (size ~500 MB).
+
+This model was contributed by `nielsr <https://huggingface.co/nielsr>`__. The original code can be found `here
+<https://github.com/google-research/language/tree/master/language/canine>`__.
+
+
+Example
+_______________________________________________________________________________________________________________________
+
+CANINE works on raw characters, so it can be used without a tokenizer:
+
+.. code-block::
+
+    from transformers import CanineModel
+    import torch
+
+    model = CanineModel.from_pretrained('google/canine-c') # model pre-trained with autoregressive character loss
+
+    text = "hello world"
+    # use Python's built-in ord() function to turn each character into its unicode code point id
+    input_ids = torch.tensor([[ord(char) for char in text]])
+
+    outputs = model(input_ids) # forward pass
+    pooled_output = outputs.pooler_output
+    sequence_output = outputs.last_hidden_state
+
+
+For batched inference and training, it is however recommended to make use of the tokenizer (to pad/truncate all
+sequences to the same length):
+
+.. code-block::
+
+    from transformers import CanineTokenizer, CanineModel
+
+    model = CanineModel.from_pretrained('google/canine-c')
+    tokenizer = CanineTokenizer.from_pretrained('google/canine-c')
+
+    inputs = ["Life is like a box of chocolates.", "You never know what you gonna get."]
+    encoding = tokenizer(inputs, padding="longest", truncation=True, return_tensors="pt")
+
+    outputs = model(**encoding) # forward pass
+    pooled_output = outputs.pooler_output
+    sequence_output = outputs.last_hidden_state
+
+
+CANINE specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.canine.modeling_canine.CanineModelOutputWithPooling
+    :members:
+
+
+CanineConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineConfig
+    :members:
+
+
+CanineTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences
+
+
+CanineModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineModel
+    :members: forward
+
+
+CanineForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineForSequenceClassification
+    :members: forward
+
+
+CanineForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineForMultipleChoice
+    :members: forward
+
+
+CanineForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineForTokenClassification
+    :members: forward
+
+
+CanineForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CanineForQuestionAnswering
+    :members: forward
--- a/docs/source/model_doc/clip.rst
+++ b/docs/source/model_doc/clip.rst
@@ -60,7 +60,6 @@ encode the text and prepare the images. The following example shows how to get t

 .. code-block::

-        >>> import torch
        >>> from PIL import Image
        >>> import requests

--- a/docs/source/model_doc/gpt_neo.rst
+++ b/docs/source/model_doc/gpt_neo.rst
@@ -71,3 +71,16 @@ GPTNeoForSequenceClassification

 .. autoclass:: transformers.GPTNeoForSequenceClassification
    :members: forward
+
+FlaxGPTNeoModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxGPTNeoModel
+    :members: __call__
+
+
+FlaxGPTNeoForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxGPTNeoForCausalLM
+    :members: __call__
--- a/docs/source/model_doc/hubert.rst
+++ b/docs/source/model_doc/hubert.rst
@@ -63,3 +63,16 @@ HubertForCTC

 .. autoclass:: transformers.HubertForCTC
    :members: forward
+
+TFHubertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFHubertModel
+    :members: call
+
+
+TFHubertForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFHubertForCTC
+    :members: call
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -216,3 +216,17 @@ TFMarianMTModel

 .. autoclass:: transformers.TFMarianMTModel
    :members: call
+
+
+FlaxMarianModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMarianModel
+    :members: __call__
+
+
+FlaxMarianMTModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMarianMTModel
+    :members: __call__
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@@ -240,3 +240,31 @@ TFMBartForConditionalGeneration

 .. autoclass:: transformers.TFMBartForConditionalGeneration
    :members: call
+
+
+FlaxMBartModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMBartModel
+    :members: __call__, encode, decode
+
+
+FlaxMBartForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMBartForConditionalGeneration
+    :members: __call__, encode, decode
+
+
+FlaxMBartForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMBartForSequenceClassification
+    :members: __call__, encode, decode
+
+
+FlaxMBartForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxMBartForQuestionAnswering
+    :members: __call__, encode, decode
--- a/docs/source/model_doc/roformer.rst
+++ b/docs/source/model_doc/roformer.rst
@@ -56,7 +56,7 @@ RoFormerTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


-RobertaTokenizerFast
+RoFormerTokenizerFast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RoFormerTokenizerFast
--- a/docs/source/model_doc/wav2vec2.rst
+++ b/docs/source/model_doc/wav2vec2.rst
@@ -99,3 +99,23 @@ TFWav2Vec2ForCTC

 .. autoclass:: transformers.TFWav2Vec2ForCTC
    :members: call
+
+
+FlaxWav2Vec2Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxWav2Vec2Model
+    :members: __call__
+
+
+FlaxWav2Vec2ForCTC
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxWav2Vec2ForCTC
+    :members: __call__
+
+FlaxWav2Vec2ForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxWav2Vec2ForPreTraining
+    :members: __call__
--- a/docs/source/parallelism.md
+++ b/docs/source/parallelism.md
@@ -0,0 +1,354 @@
+<!---
+Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Model Parallelism
+
+
+## Parallelism overview
+
+In the modern machine learning the various approaches to parallelism are used to:
+1. fit very large models onto limited hardware - e.g. t5-11b is 45GB in just model params
+2. significantly speed up training - finish training that would take a year in hours
+
+We will first discuss in depth various 1D parallelism techniques and their pros and cons and then look at how they can be combined into 2D and 3D parallelism to enable an even faster training and to support even bigger models. Various other powerful alternative approaches will be presented.
+
+While the main concepts most likely will apply to any other framework, this article is focused on PyTorch-based implementations.
+
+
+## Concepts
+
+The following is the brief description of the main concepts that will be described later in depth in this document.
+
+1. DataParallel (DP) - the same setup is replicated multiple times, and each being fed a slice of the data. The processing is done in parallel and all setups are synchronized at the end of each training step.
+2. TensorParallel (TP) - each tensor is split up into multiple chunks, so instead of having the whole tensor reside on a single gpu, each shard of the tensor resides on its designated gpu. During processing each shard gets processed separately and in parallel on different GPUs and the results are synced at the end of the step. This is what one may call horizontal parallelism, as the splitting happens on horizontal level.
+3. PipelineParallel (PP) - the model is split up vertically (layer-level) across multiple GPUs, so that only one or several layers of the model are places on a single gpu. Each gpu processes in parallel different stages of the pipeline and working on a small chunk of the batch.
+4. Zero Redundancy Optimizer (ZeRO) - Also performs sharding of the tensors somewhat similar to TP, except the whole tensor gets reconstructed in time for a forward or backward computation, therefore the model does't need to be modified. It also supports various offloading techniques to compensate for limited GPU memory.
+5. Sharded DDP - is another name for the foundational ZeRO concept as used by various other implementations of ZeRO.
+
+
+## Data Parallel
+
+Most users with just 2 GPUs already enjoy the increased training speed up thanks to DataParallel (DP) and DistributedDataParallel (DDP) that are almost trivial to use. This is a built-in feature of Pytorch.
+
+## ZeRO Data Parallel
+
+ZeRO-powered data parallelism (ZeRO-DP) is described on the following diagram from this [blog post](https://www.microsoft.com/en-us/research/blog/zero-deepspeed-new-system-optimizations-enable-training-models-with-over-100-billion-parameters/)
+![DeepSpeed-Image-1](imgs/parallelism-zero.png)
+
+It can be difficult to wrap one's head around it, but in reality the concept is quite simple. This is just the usual DataParallel (DP), except, instead of replicating the full model params, gradients and optimizer states, each GPU stores only a slice of it.  And then at run-time when the full layer params are needed just for the given layer, all GPUs synchronize to give each other parts that they miss - this is it.
+
+Consider this simple model with 3 layers, where each layer has 3 params:
+```
+La | Lb | Lc
+---|----|---
+a0 | b0 | c0
+a1 | b1 | c1
+a2 | b2 | c2
+```
+Layer La has weights a0, at and a2.
+
+If we have 3 GPUs, the Sharded DDP (= Zero-DP) splits the model onto 3 GPUs like so:
+
+```
+GPU0:
+La | Lb | Lc
+---|----|---
+a0 | b0 | c0
+
+GPU1:
+La | Lb | Lc
+---|----|---
+a1 | b1 | c1
+
+GPU2:
+La | Lb | Lc
+---|----|---
+a2 | b2 | c2
+```
+
+In a way this is the same horizontal slicing, as tensor parallelism, if you imagine the typical DNN diagram. Vertical slicing is where one puts whole layer-groups on different GPUs. But it's just the starting point.
+
+Now each of these GPUs will get the usual mini-batch as it works in DP:
+```
+x0 => GPU0
+x1 => GPU1
+x2 => GPU2
+```
+
+The inputs are unmodified - they think they are going to be processed by the normal model.
+
+First, the inputs hit the layer La.
+
+Let's focus just on GPU0: x0 needs a0, a1, a2 params to do its forward path, but GPU0 has only a0 - it gets sent a1 from GPU1 and a2 from GPU2, bringing all pieces of the model together.
+
+In parallel, GPU1 gets mini-batch x1 and it only has a1, but needs a0 and a2 params, so it gets those from GPU0 and GPU2.
+
+Same happens to GPU2 that gets input x2. It gets a0 and a1 from GPU0 and GPU1, and with its a2 it reconstructs the full tensor.
+
+All 3 GPUs get the full tensors reconstructed and a forward happens.
+
+As soon as the calculation is done, the data that is no longer needed gets dropped - it's only used during the calculation. The reconstruction is done efficiently via a pre-fetch.
+
+And the whole process is repeated for layer Lb, then Lc forward-wise, and then backward Lc -> Lb -> La.
+
+To me this sounds like an efficient group backpacking weight distribution strategy:
+
+1. person A carries the tent
+2. person B carries the stove
+3. person C carries the axe
+
+Now each night they all share what they have with others and get from others what the don't have, and in the morning they pack up their allocated type of gear and continue on their way. This is Sharded DDP / Zero DP.
+
+Compare this strategy to the simple one where each person has to carry their own tent, stove and axe, which would be far more inefficient. This is DataParallel (DP and DDP) in Pytorch.
+
+While reading the literature on this topic you may encounter the following synonyms: Sharded, Partitioned.
+
+If you pay close attention the way ZeRO partitions the model's weights - it looks very similar to tensor parallelism which will be discussed later. This is because it partitions/shards each layer's weights, unlike vertical model parallelism which is discussed next.
+
+Implementations:
+
+- [DeepSpeed](https://www.deepspeed.ai/features/#the-zero-redundancy-optimizer) ZeRO-DP stages 1+2+3
+- [Fairscale](https://github.com/facebookresearch/fairscale/#optimizer-state-sharding-zero) ZeRO-DP stages 1+2+3
+- [`transformers` integration](https://huggingface.co/transformers/master/main_classes/trainer.html#trainer-integrations)
+
+## Naive Model Parallel (Vertical) and Pipeline Parallel
+
+Naive Model Parallel (MP) is where one spreads groups of model layers across multiple GPUs. The mechanism is relatively simple - switch the desired layers `.to()` the desired devices and now whenever the data goes in and out those layers switch the data to the same device as the layer and leave the rest unmodified.
+
+We refer to it as Vertical MP, because if you remember how most models are drawn, we slice the layers vertically. For example, if the following diagram shows an 8-layer model:
+
+```
+===================  ===================
+|  0 | 1 | 2 | 3  |  |  4 | 5 | 6 | 7  |
+===================  ===================
+        gpu0                 gpu1
+```
+we just sliced it in 2 vertically, placing layers 0-3 onto GPU0 and 4-7 to GPU1.
+
+Now while data travels from layer 0 to 1, 1 to 2 and 2 to 3 this is just the normal model. But when data needs to pass from layer 3 to layer 4 it needs to travel from GPU0 to GPU1 which introduces a communication overhead. If the participating GPUs are on the same compute node (e.g. same physical machine) this copying is pretty fast, but if the GPUs are located on different compute nodes (e.g. multiple machines) the communication overhead could be significantly larger.
+
+Then layers 4 to 5 to 6 to 7 are as a normal model would have and when the 7th layer completes we often need to send the data back to layer 0 where the labels are (or alternatively send the labels to the the last layer). Now the loss can be computed and the optimizer can do its work.
+
+Problems:
+- the main deficiency and why this one is called "naive" MP, is that all but one GPU is idle at any given moment. So if 4 GPUs are used, it's almost identical to quadrupling the amount of memory of a single GPU, and ignoring the rest of the hardware. Plus there is the overhead of copying the data between devices. So 4x 6GB cards will be able to accommodate the same size as 1x 24GB card using naive MP, except the latter will complete the training faster, since it doesn't have the data copying overhead. But, say, if you have 40GB cards and need to fit a 45GB model you can with 4x 40GB cards (but barely because of the gradient and optimizer states)
+- shared embeddings may need to get copied back and forth between GPUs.
+
+Pipeline Parallel (PP) is almost identical to a naive MP, but it solves the GPU idling problem, by chunking the incoming batch into micro-batches and artificially creating a pipeline, which allows different GPUs to concurrently participate in the computation process.
+
+The following illustration from the [GPipe paper](https://ai.googleblog.com/2019/03/introducing-gpipe-open-source-library.html) shows the naive MP on the top, and PP on the bottom:
+
+![mp-pp](imgs/parallelism-gpipe-bubble.png)
+
+It's easy to see from the bottom diagram how PP has less dead zones, where GPUs are idle. The idle parts are referred to as the "bubble".
+
+Both parts of the diagram show a parallelism that is of degree 4. That is 4 GPUs are participating in the pipeline. So there is the forward path of 4 pipe stages F0, F1, F2 and F3 and then the return reverse order backward path of B3, B2, B1 and B0.
+
+PP introduces a new hyper-parameter to tune and it's `chunks` which defines how many chunks of data are sent in a sequence through the same pipe stage. For example, in the bottomw diagram you can see that `chunks=4`. GPU0 performs the same forward path on chunk 0, 1, 2 and 3 (F0,0, F0,1, F0,2, F0,3) and then it waits for other GPUs to do their work and only when their work is starting to be complete, GPU0 starts to work again doing the backward path for chunks 3, 2, 1 and 0 (B0,3, B0,2, B0,1, B0,0).
+
+Note that conceptually this is the same concept as gradient accumulation steps (GAS). Pytorch uses `chunks`, whereas DeepSpeed refers to the same hyper-parameter as GAS.
+
+Because of the chunks, PP introduces the concept of micro-batches (MBS). DP splits the global data batch size into mini-batches, so if you have a DP degree of 4, a global batch size of 1024 gets split up into 4 mini-batches of 256 each (1024/4). And if the number of `chunks` (or GAS) is 32 we end up with a micro-batch size of 8 (256/32). Each Pipeline stage works with a single micro-batch at a time.
+
+To calculate the global batch size of the DP + PP setup we then do: `mbs*chunks*dp_degree` (`8*32*4=1024`).
+
+Let's go back to the diagram.
+
+With `chunks=1` you end up with the naive MP, which is very inefficient. With a very large `chunks` value you end up with tiny micro-batch sizes which could be not every efficient either. So one has to experiment to find the value that leads to the highest efficient utilization of the gpus.
+
+While the diagram shows that there is a bubble of "dead" time that can't be parallelized because the last `forward` stage has to wait for `backward` to complete the pipeline, the purpose of finding the best value for `chunks` is to enable a high concurrent GPU utilization across all participating GPUs which translates to minimizing the size of the bubble.
+
+Problems:
+- have to modify the model quite heavily, because Pipeline requires one to rewrite the normal flow of modules into a `nn.Sequential` sequence of the same, which may require changes to the design of the model.
+- currently the Pipeline API is very restricted. If you had a bunch of python variables being passed in the very first stage of the Pipeline, you will have to find a way around it. Currently, the pipeline interface requires either a single Tensor or a tuple of Tensors as the only input and output. These tensors must have a batch size as the very first dimension, since pipeline is going to chunk the mini batch into micro-batches. Possible improvements are being discussed here https://github.com/pytorch/pytorch/pull/50693
+- have to arrange each layer so that the output of one model becomes an input to the other model
+
+Implementations:
+- [Pytorch](https://pytorch.org/docs/stable/pipeline.html) (initial support in pytorch-1.8, and progressively getting improved in 1.9 and more so in 1.10). Some [examples](https://github.com/pytorch/pytorch/blob/master/benchmarks/distributed/pipeline/pipe.py)
+- [FairScale](https://fairscale.readthedocs.io/en/latest/tutorials/pipe.html)
+- [DeepSpeed](https://www.deepspeed.ai/tutorials/pipeline/)
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation - no API.
+
+🤗 Transformers status: as of this writing none of the models supports full-PP. GPT2 and T5 models have naive PP support. The main obstacle is being unable to convert the models to `nn.Sequential` and have all the inputs to be Tensors. This is because currently the models include many features that make the conversion very complicated, and will need to be removed to accomplish that.
+
+Other approaches:
+
+DeepSpeed and SageMaker use the concept of an [Interleaved Pipeline](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html)
+![interleaved-pipeline-execution](imgs/parallelism-sagemaker-interleaved-pipeline.png)
+
+Here the bubble (idle time) is further minimized by prioritizing backward passes.
+
+According to [the same document](https://docs.aws.amazon.com/sagemaker/latest/dg/model-parallel-core-features.html), it might be able to automate the non `nn.Sequential` model conversion to pipeline. The only problem is that this is currently only available at AWS, so you can't run it on your own hardware.
+
+
+## Tensor Parallelism
+
+In Tensor Parallelism each GPU processes only a slice of a tensor and only aggregates the full tensor for operations that require the whole thing.
+
+In this section we use concepts and diagrams from the [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) paper: [Efficient Large-Scale Language Model Training on GPU Clusters](https://arxiv.org/abs/2104.04473).
+
+The main building block of any transformer is a fully connected `nn.Linear` followed by a nonlinear activation `GeLU`.
+
+Following the Megatron's paper notation, we can write the dot-product part of it as `Y = GeLU(XA)`, where `X` and `Y` are the input and output vectors, and `A` is the weight matrix.
+
+If we look at the computation in matrix form, it's easy to see how the matrix multiplication can be split between multiple GPUs:
+![Parallel GEMM](imgs/parallelism-tp-parallel_gemm.png)
+
+If we split the weight matrix `A` column-wise across `N` GPUs and perform matrix multiplications `XA_1` through `XA_n` in parallel, then we will end up with `N` output vectors `Y_1, Y_2, ..., Y_n` which can be fed into `GeLU` independently:
+![independent GeLU](imgs/parallelism-tp-independent-gelu.png)
+
+Using this principle, we can update an MLP of arbitrary depth, without the need for any synchronization between GPUs until the very end, where we need to reconstruct the output vector from shards. The Megatron-LM paper authors provide a helpful illustration for that:
+![parallel shard processing](imgs/parallelism-tp-parallel_shard_processing.png)
+
+Parallelizing the multi-headed attention layers is even simpler, since they are already inherently parallel, due to having multiple independent heads!
+![parallel self-attention](imgs/parallelism-tp-parallel_self_attention.png)
+
+Special considerations: TP requires very fast network, and therefore it's not advisable to do TP across more than one node. Practically, if a node has 4 GPUs, the highest TP degree is therefore 4. If you need a TP degree of 8, you need to use nodes that have at least 8 GPUs.
+
+This section is based on the original much more [detailed TP overview](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530).
+by [@anton-l](https://github.com/anton-l).
+
+Implementations:
+- DeepSpeed calls it [tensor slicing](https://www.deepspeed.ai/features/#model-parallelism)
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM) has an internal implementation.
+
+🤗 Transformers status:
+- core: not yet implemented in the core
+- but if you want inference [parallelformers](https://github.com/tunib-ai/parallelformers) provides this support for most of our models. So until this is implemented in the core you can use theirs. And hopefully training mode will be supported too.
+- Deepspeed-Inference also supports our BERT, GPT-2, and GPT-Neo models in their super-fast CUDA-kernel-based inference mode, see more [here](https://www.deepspeed.ai/tutorials/inference-tutorial/)
+
+
+
+## DP+PP
+
+The following diagram from the DeepSpeed [pipeline tutorial](https://www.deepspeed.ai/tutorials/pipeline/) demonstrates how one combines DP with PP.
+
+![dp-pp-2d](imgs/parallelism-zero-dp-pp.png)
+
+Here it's important to see how DP rank 0 doesn't see GPU2 and DP rank 1 doesn't see GPU3. To DP there is just GPUs 0 and 1 where it feeds data as if there were just 2 GPUs. GPU0 "secretly" offloads some of its load to GPU2 using PP. And GPU1 does the same by enlisting GPU3 to its aid.
+
+Since each dimension requires at least 2 GPUs, here you'd need at least 4 GPUs.
+
+Implementations:
+- [DeepSpeed](https://github.com/microsoft/DeepSpeed)
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
+
+🤗 Transformers status: not yet implemented
+
+
+## DP+PP+TP
+
+To get an even more efficient training a 3D parallelism is used where PP is combined with TP and DP. This can be seen in the following diagram.
+
+![dp-pp-tp-3d](imgs/parallelism-deepspeed-3d.png)
+
+This diagram is from a blog post [3D parallelism: Scaling to trillion-parameter models](https://www.microsoft.com/en-us/research/blog/deepspeed-extreme-scale-model-training-for-everyone/), which is a good read as well.
+
+Since each dimension requires at least 2 GPUs, here you'd need at least 8 GPUs.
+
+Implementations:
+- [DeepSpeed](https://github.com/microsoft/DeepSpeed) - DeepSpeed also includes an even more efficient DP, which they call ZeRO-DP.
+- [Megatron-LM](https://github.com/NVIDIA/Megatron-LM)
+
+🤗 Transformers status: not yet implemented, since we have no PP and TP.
+
+
+## DP+PP+TP+ZeRO
+
+One of the main features of DeepSpeed is ZeRO, which is a super-scalable extension of DP. It has already been discussed in [ZeRO Data Parallel](#zero-data-parallel). Normally it's a standalone feature that doesn't require PP or TP. But it can be combined with PP and TP.
+
+When ZeRO-DP is combined with PP (and optinally TP) it typically enables only ZeRO stage 1 (optimizer sharding).
+
+While it's theoretically possible to use ZeRO stage 2 (gradient sharding) with Pipeline Parallelism, it will have bad performance impacts. There would need to be an additional reduce-scatter collective for every micro-batch to aggregate the gradients before sharding, which adds a potentially significant communication overhead. By nature of Pipeline Parallelism, small micro-batches are used and instead the focus is on trying to balance arithmetic intensity (micro-batch size) with minimizing the Pipeline bubble (number of micro-batches). Therefore those communication costs are going to hurt.
+
+In addition, There are already fewer layers than normal due to PP and so the memory savings won't be huge. PP already reduces gradient size by ``1/PP``, and so gradient sharding savings on top of that are less significant than pure DP.
+
+ZeRO stage 3 is not a good choice either for the same reason - more inter-node communications required.
+
+And since we have ZeRO, the other benefit is ZeRO-Offload. Since this is stage 1 optimizer states can be offloaded to CPU.
+
+Implementations:
+- [Megatron-DeepSpeed](https://github.com/microsoft/Megatron-DeepSpeed)
+
+🤗 Transformers status: not yet implemented, since we have no PP and TP.
+
+
+## FlexFlow
+
+[FlexFlow](https://github.com/flexflow/FlexFlow) also solves the parallelization problem in a slightly different approach.
+
+Paper: ["Beyond Data and Model Parallelism for Deep Neural Networks" by Zhihao Jia, Matei Zaharia, Alex Aiken](https://arxiv.org/abs/1807.05358)
+
+It performs a sort of 4D Parallelism over Sample-Operator-Attribute-Parameter.
+
+1. Sample = Data Parallelism
+2. Operator = part vertical Layer Parallelism, but it can split the layer too - more refined level
+3. Attribute = horizontal Model Parallelism (Megatron-LM style)
+4. Parameter = Sharded model params
+
+and they are working on Pipeline Parallelism. I guess ZeRO-DP is Sample+Parameter in this context.
+
+![flex-flow-soap](imgs/parallelism-flexflow.jpeg)
+
+The significance of this framework is that it takes resources like (1) GPU/TPU/CPU vs. (2) RAM/DRAM vs. (3) fast-intra-connect/slow-inter-connect and it automatically optimizes all these  algorithmically deciding which parallelisation to use where.
+
+One very important aspect is that FlexFlow is designed for optimizing DNN parallelizations for models with static and fixed workloads, since models with dynamic behavior may prefer different parallelization strategies across iterations.
+
+So the promise is very attractive - it runs a 30min simulation on the cluster of choice and it comes up with the best strategy to utilise this specific environment. If you add/remove/replace any parts it'll run and re-optimize the plan for that. And then you can train. A different setup will have its own custom optimization.
+
+🤗 Transformers status: not yet integrated. We already have our models FX-trace-able via [transformers.utils.fx](https://github.com/huggingface/transformers/blob/master/src/transformers/utils/fx.py), which is a prerequisite for FlexFlow, so someone needs to figure out what needs to be done to make FlexFlow work with our models.
+
+
+## Which Strategy To Use When
+
+Here is a very rough outlook at which parallelism strategy to use when. The first on the list is typically faster.
+
+**⇨ Single GPU**
+
+* Model fits onto a single GPU:
+
+    1. Normal use
+
+* Model doesn't fit onto a single GPU:
+
+    1. ZeRO + Offload CPU and optionally NVMe
+
+
+**⇨ Single Node / Multi-GPU**
+
+* Model fits onto a single GPU:
+
+    1. DDP - Distributed DP
+    2. ZeRO - may or may not be faster depending on the situation and configuration used
+
+* Model doesn't fit onto a single GPU:
+
+    1. PP
+    2. ZeRO
+    3. TP
+
+    With very fast intra-node connectivity of NVLINK or NVSwitch all three should be mostly on par, without these PP will be faster than TP and ZeRO. The degree of TP may also make a difference. Best to experiment to find the winner on your particular setup.
+
+
+**⇨ Multi-Node / Multi-GPU**
+
+* When you have fast inter-node connectivity:
+
+    1. ZeRO - as it requires close to no modifications to the model
+    2. PP+TP+DP - less communications, but requires massive changes to the model
+
+* when you have slow inter-node connectivity and still low on GPU memory:
+
+    1. DP+PP+TP+ZeRO-1
--- a/docs/source/performance.md
+++ b/docs/source/performance.md
@@ -226,6 +226,18 @@ pytorch `autocast` which performs AMP include a caching feature, which speed thi

 Autocast maintains a cache of the FP16 casts of model params (leaves). This helps streamline parameter reuse: if the same FP32 param is used in several different FP16list ops, like several matmuls, instead of re-casting the param to FP16 on entering each matmul, the cast will occur on the first matmul, the casted FP16 copy will be cached, and for all later matmuls the FP16 copy will be reused. The cache is maintained only within a particular outermost autocast context. When you exit the autocast context the cache is dropped. For recommended usage, in which autocast wraps the forward pass, and then you exit the context before calling backward(), this means the cache only lasts the duration of the forward pass each iteration, and will be rebuilt next iteration. (The cache of FP16-casted copies MUST be rebuilt each iteration. The FP32 params get updated by the optimizer, so the FP16 copies must be recreated, otherwise the FP16 values will be stale.)

+### Batch sizes
+
+One gets the most efficient performance when batch sizes and input/output neuron counts are divisible by a certain number, which typically starts at 8, but can be much higher as well. That number varies a lot depending on the specific hardware being used and the dtype of the model.
+
+For example for fully connected layers (which correspond to GEMMs), NVIDIA provides recommendations for [input/output neuron counts](
+https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#input-features) and [batch size](https://docs.nvidia.com/deeplearning/performance/dl-performance-fully-connected/index.html#batch-size).
+
+[Tensor Core Requirements](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#requirements-tc) define the multiplier based on the dtype and the hardware. For example, for fp16 a multiple of 8 is recommended, but on A100 it's 64!
+
+For parameters that are small, there is also [Dimension Quantization Effects](https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html#dim-quantization) to consider, this is where tiling happens and the right multiplier can have a significant speedup.
+
+
 ### DP vs DDP

 `DistributedDataParallel` (DDP) is typically faster than `DataParallel` (DP), but it is not always the case:
--- a/docs/source/sagemaker.md
+++ b/docs/source/sagemaker.md
@@ -16,388 +16,10 @@ limitations under the License.

 # Run training on Amazon SageMaker

-Hugging Face and Amazon are introducing new [Hugging Face Deep Learning Containers (DLCs)](#deep-learning-container-dlc-overview) to make it easier than ever to train Hugging Face Transformer models in [Amazon SageMaker](https://aws.amazon.com/sagemaker/).
+The documentation has been moved to [hf.co/docs/sagemaker](https://huggingface.co/docs/sagemaker). This page will be removed in `transformers` 5.0. 

-You can find a full list of all available [Hugging Face Deep Learning Containers](#deep-learning-container-dlc-overview) at the end of this page. 
+### Table of Content

-To learn how to access and use the new Hugging Face DLCs with the Amazon SageMaker Python SDK, check out the guides and resources below.
-
---
-
-
-## Getting Started: Train a 🤗 Transformers Model
-
-To train a 🤗 Transformers model by using the `HuggingFace` SageMaker Python SDK you need to:
-
- [Prepare a training script](#prepare-a-transformers-fine-tuning-script)
- [Create a `HuggingFace` Estimator](#create-an-huggingface-estimator)
- [Run training by calling the `fit` method](#execute-training)
- [Access you model](#access-trained-model)
-
-### Setup & Installation
-
-Before you can train a transformers models with Amazon SageMaker you need to sign up for an AWS account. If you do not have an AWS account yet learn more [here](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-set-up.html).
-
-After you complete these tasks you can get started using either [SageMaker Studio](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-studio-onboard.html), [SageMaker Notebook Instances](https://docs.aws.amazon.com/sagemaker/latest/dg/gs-console.html), or a local environment. To start training locally you need configure the right [IAM permission](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).
-
-Upgrade to the latest `sagemaker` version.
-
-```bash
-pip install sagemaker --upgrade
-```
-
-**SageMaker environment**
-
-_Note: The execution role is intended to be available only when running a notebook within SageMaker. If you run `get_execution_role` in a notebook not on SageMaker, expect a "region" error._
-
-```python
-import sagemaker
-sess = sagemaker.Session()
-role = sagemaker.get_execution_role()
-```
-
-**Local environment**
-
-```python
-import sagemaker
-import boto3
-
-iam_client = boto3.client('iam')
-role = iam_client.get_role(RoleName='role-name-of-your-iam-role-with-right-permissions')['Role']['Arn']
-sess = sagemaker.Session()
-```
-
-### Prepare a 🤗 Transformers fine-tuning script.
-
-The training script is very similar to a training script you might run outside of SageMaker, but you can access useful properties about the training environment through various environment variables, including the following:
-
- `SM_MODEL_DIR`: A string that represents the path where the training job writes the model artifacts to. After training, artifacts in this directory are uploaded to S3 for model hosting. `SM_MODEL_DIR` is always set to `/opt/ml/model`.
-
- `SM_NUM_GPUS`: An integer representing the number of GPUs available to the host.
-
- `SM_CHANNEL_XXXX:` A string that represents the path to the directory that contains the input data for the specified channel. For example, if you specify two input channels in the HuggingFace estimator’s fit call, named `train` and `test`, the environment variables `SM_CHANNEL_TRAIN` and `SM_CHANNEL_TEST` are set.
-
-You can find a full list of the exposed environment variables [here](https://github.com/aws/sagemaker-training-toolkit/blob/master/ENVIRONMENT_VARIABLES.md).
-
-Later we define `hyperparameters` in the [HuggingFace Estimator](#create-an-huggingface-estimator), which are passed in as named arguments and and can be processed with the `ArgumentParser()`.
-
-```python
-import transformers
-import datasets
-import argparse
-import os
-
-if __name__ == "__main__":
-
-    parser = argparse.ArgumentParser()
-
-    # hyperparameters sent by the client are passed as command-line arguments to the script.
-    parser.add_argument("--epochs", type=int, default=3)
-    parser.add_argument("--per_device_train_batch_size", type=int, default=32)
-    parser.add_argument("--model_name_or_path", type=str)
-
-    # Data, model, and output directories
-    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
-    parser.add_argument("--training_dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
-    parser.add_argument("--test_dir", type=str, default=os.environ["SM_CHANNEL_TEST"])
-```
-
-_Note that SageMaker doesn’t support argparse actions. For example, if you want to use a boolean hyperparameter, specify `type` as `bool` in your script and provide an explicit `True` or `False` value._
-
-For a complete example of a 🤗 Transformers training script, see [train.py](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/scripts/train.py)
-
-### Create an HuggingFace Estimator
-
-You run 🤗 Transformers training scripts on SageMaker by creating `HuggingFace` Estimators. The Estimator handles end-to-end Amazon SageMaker training. The training of your script is invoked when you call `fit` on a `HuggingFace` Estimator. In the Estimator you define, which fine-tuning script should be used as `entry_point`, which `instance_type` should be used, which `hyperparameters` are passed in, you can find all possible `HuggingFace` Parameter [here](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/sagemaker.huggingface.html#huggingface-estimator). and an example of a fine-tuning script [here](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/scripts/train.py).
-You can find all useable `instance_types` [here](https://aws.amazon.com/de/sagemaker/pricing/).
-
-The following code sample shows how you train a custom `HuggingFace` script `train.py`, passing in three hyperparameters (`epochs`, `per_device_train_batch_size`, and `model_name_or_path`).
-
-```python
-from sagemaker.huggingface import HuggingFace
-
-
-# hyperparameters, which are passed into the training job
-hyperparameters={'epochs': 1,
-                 'per_device_train_batch_size': 32,
-                 'model_name_or_path': 'distilbert-base-uncased'
-                 }
-
-# create the Estimator
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3.2xlarge',
-        instance_count=1,
-        role=role,
-        transformers_version='4.4',
-        pytorch_version='1.6',
-        py_version='py36',
-        hyperparameters = hyperparameters
-)
-```
-
-To run the `TrainingJob` locally you can define `instance_type='local'` or `instance_type='local-gpu'` for gpu usage. _Note: this does not working within SageMaker Studio_
-
-### Execute Training
-
-You start your `TrainingJob` by calling `fit` on a `HuggingFace` Estimator. In the `fit` method you specify your input training data, like a string S3 URI `s3://my-bucket/my-training-data` or a `FileSystemInput` for [EFS or FSx Lustre](https://sagemaker.readthedocs.io/en/stable/overview.html?highlight=FileSystemInput#use-file-systems-as-training-inputs), see [here](https://sagemaker.readthedocs.io/en/stable/overview.html?highlight=FileSystemInput#use-file-systems-as-training-inputs).
-
-```python
-huggingface_estimator.fit(
-  {'train': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/train',
-   'test': 's3://sagemaker-us-east-1-558105141721/samples/datasets/imdb/test'}
-)
-
-```
-
-SageMaker takes care of starting and managing all the required ec2 instances for ands starts the training job by running.
-
-```bash
-/opt/conda/bin/python train.py --epochs 1 --model_name_or_path distilbert-base-uncased --per_device_train_batch_size 32
-```
-
-### Access trained model
-
-After training is done you can access your model either through the [AWS console](https://console.aws.amazon.com/console/home?nc2=h_ct&src=header-signin) or downloading it directly from S3.
-
-```python
-from sagemaker.s3 import S3Downloader
-
-S3Downloader.download(
-    s3_uri=huggingface_estimator.model_data, # s3 uri where the trained model is located
-    local_path='.', # local path where *.targ.gz is saved
-    sagemaker_session=sess # sagemaker session used for training the model
-)
-```
-
---
-
-## Sample Notebooks
-
-You can find here a list of the official notebooks provided by Hugging Face.
-
-| Notebook                                                                                                                                                                                        | Description                                                                                                      |
-| ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------- |
-| [Getting Started Pytorch](https://github.com/huggingface/notebooks/blob/master/sagemaker/01_getting_started_pytorch/sagemaker-notebook.ipynb)                                                   | End-to-End binary Text-Classification example using `Trainer` and `imdb` dataset                                 |
-| [Getting Started Tensorflow](https://github.com/huggingface/notebooks/blob/master/sagemaker/02_getting_started_tensorflow/sagemaker-notebook.ipynb)                                             | End-to-End binary Text-Classification example using `Keras` and `imdb` dataset                                   |
-| [Distributed Training Data Parallelism](https://github.com/huggingface/notebooks/blob/master/sagemaker/03_distributed_training_data_parallelism/sagemaker-notebook.ipynb)                       | End-to-End distributed Question-Answering example using `Trainer` and 🤗 Transformers example script for `SQAuD` |
-| [Distributed Training Model Parallelism](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb)                     | End-to-End model parallelism example using `SageMakerTrainer` and `run_glue.py` script                           |
-| [Spot Instances and continues training](https://github.com/huggingface/notebooks/blob/master/sagemaker/05_spot_instances/sagemaker-notebook.ipynb)                                              | End-to-End to Text-Classification example using spot instances with continued training.                          |
-| [SageMaker Metrics](https://github.com/huggingface/notebooks/blob/master/sagemaker/06_sagemaker_metrics/sagemaker-notebook.ipynb)                                                               | End-to-End to Text-Classification example using SageMaker Metrics to extract and log metrics during training     |
-| [Distributed Training Data Parallelism Tensorflow](https://github.com/huggingface/notebooks/blob/master/sagemaker/07_tensorflow_distributed_training_data_parallelism/sagemaker-notebook.ipynb) | End-to-End distributed binary Text-Classification example using `Keras` and `TensorFlow`                    
-| [Distributed Seq2Seq Training with Data Parallelism and BART](https://github.com/huggingface/notebooks/blob/master/sagemaker/08_distributed_summarization_bart_t5/sagemaker-notebook.ipynb) | End-to-End distributed summarization example with `BART-large` and 🤗 Transformers example script for `summarization`                        |
-| [Image Classification using Vision Transformer](https://github.com/huggingface/notebooks/blob/master/sagemaker/09_image_classification_vision_transformer/sagemaker-notebook.ipynb) | End-to-End image classification example with `Vision Transformers`                 |
-
---
-
-## Advanced Features
-
-In addition to the Deep Learning Container and the SageMaker SDK, we have implemented other additional features.
-
-### Distributed Training: Data-Parallel
-
-You can use [SageMaker Data Parallelism Library](https://aws.amazon.com/blogs/aws/managed-data-parallelism-in-amazon-sagemaker-simplifies-training-on-large-datasets/) out of the box for distributed training. We added the functionality of Data Parallelism directly into the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html). If your `train.py` uses the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) API you only need to define the distribution parameter in the HuggingFace Estimator.
-
- [Example Notebook PyTorch](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb)
- [Example Notebook TensorFlow](https://github.com/huggingface/notebooks/blob/master/sagemaker/07_tensorflow_distributed_training_data_parallelism/sagemaker-notebook.ipynb)
-
-```python
-# configuration for running training on smdistributed Data Parallel
-distribution = {'smdistributed':{'dataparallel':{ 'enabled': True }}}
-
-# create the Estimator
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3dn.24xlarge',
-        instance_count=2,
-        role=role,
-        transformers_version='4.4.2',
-        pytorch_version='1.6.0',
-        py_version='py36',
-        hyperparameters = hyperparameters
-        distribution = distribution
-)
-
-```
-
-### Distributed Training: Model-Parallel
-
-You can use [SageMaker Model Parallelism Library](https://aws.amazon.com/blogs/aws/amazon-sagemaker-simplifies-training-deep-learning-models-with-billions-of-parameters/) out of the box for distributed training. We added the functionality of Model Parallelism directly into the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html). If your `train.py` uses the [Trainer](https://huggingface.co/transformers/main_classes/trainer.html) API you only need to define the distribution parameter in the HuggingFace Estimator.  
-For detailed information about the adjustments take a look [here](https://sagemaker.readthedocs.io/en/stable/api/training/smd_model_parallel_general.html?highlight=modelparallel#required-sagemaker-python-sdk-parameters).
-
-
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/04_distributed_training_model_parallelism/sagemaker-notebook.ipynb)
-
-
-```python
-# configuration for running training on smdistributed Model Parallel
-mpi_options = {
-    "enabled" : True,
-    "processes_per_host" : 8
-}
-
-smp_options = {
-    "enabled":True,
-    "parameters": {
-        "microbatches": 4,
-        "placement_strategy": "spread",
-        "pipeline": "interleaved",
-        "optimize": "speed",
-        "partitions": 4,
-        "ddp": True,
-    }
-}
-
-distribution={
-    "smdistributed": {"modelparallel": smp_options},
-    "mpi": mpi_options
-}
-
- # create the Estimator
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3dn.24xlarge',
-        instance_count=2,
-        role=role,
-        transformers_version='4.4.2',
-        pytorch_version='1.6.0',
-        py_version='py36',
-        hyperparameters = hyperparameters,
-        distribution = distribution
-)
-```
-
-### Spot Instances
-
-With the creation of HuggingFace Framework extension for the SageMaker Python SDK we can also leverage the benefit of [fully-managed EC2 spot instances](https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html) and save up to 90% of our training cost.
-
-_Note: Unless your training job completes quickly, we recommend you use [checkpointing](https://docs.aws.amazon.com/sagemaker/latest/dg/model-checkpoints.html) with managed spot training, therefore you need to define the `checkpoint_s3_uri`._
-
-To use spot instances with the `HuggingFace` Estimator we have to set the `use_spot_instances` parameter to `True` and define your `max_wait` and `max_run` time. You can read more about the [managed spot training lifecycle here](https://docs.aws.amazon.com/sagemaker/latest/dg/model-managed-spot-training.html).
-
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/05_spot_instances/sagemaker-notebook.ipynb)
-
-```python
-# hyperparameters, which are passed into the training job
-hyperparameters={'epochs': 1,
-                 'train_batch_size': 32,
-                 'model_name':'distilbert-base-uncased',
-                 'output_dir':'/opt/ml/checkpoints'
-                 }
-# create the Estimator
-
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3.2xlarge',
-        instance_count=1,
-	    checkpoint_s3_uri=f's3://{sess.default_bucket()}/checkpoints'
-        use_spot_instances=True,
-        max_wait=3600, # This should be equal to or greater than max_run in seconds'
-        max_run=1000,
-        role=role,
-        transformers_version='4.4',
-        pytorch_version='1.6',
-        py_version='py36',
-        hyperparameters = hyperparameters
-)
-
-# Training seconds: 874
-# Billable seconds: 262
-# Managed Spot Training savings: 70.0%
-
-```
-
-### Git Repository
-
-When you create a `HuggingFace` Estimator, you can specify a [training script that is stored in a GitHub repository](https://sagemaker.readthedocs.io/en/stable/overview.html#use-scripts-stored-in-a-git-repository) as the entry point for the estimator, so that you don’t have to download the scripts locally. If Git support is enabled, the `entry_point` and `source_dir` should be relative paths in the Git repo if provided. 
-
-If you are using `git_config` to run the [🤗 Transformers examples scripts](https://github.com/huggingface/transformers/tree/master/examples) keep in mind that you need to configure the right `'branch'` for you `transformers_version`, e.g. if you use `transformers_version='4.4.2` you have to use `'branch':'v4.4.2'`. 
-
-As an example to use `git_config` with an [example script from the transformers repository](https://github.com/huggingface/transformers/tree/master/examples/pytorch/text-classification).
-
-_Tip: define `output_dir` as `/opt/ml/model` in the hyperparameter for the script to save your model to S3 after training._
-
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/02_getting_started_tensorflow/sagemaker-notebook.ipynb)
-
-```python
-# configure git settings
-git_config = {'repo': 'https://github.com/huggingface/transformers.git','branch': 'v4.4.2'} # v4.4.2 is referring to the `transformers_version you use in the estimator.
-
- # create the Estimator
-huggingface_estimator = HuggingFace(
-        entry_point='run_glue.py',
-        source_dir='./examples/pytorch/text-classification',
-        git_config=git_config,
-        instance_type='ml.p3.2xlarge',
-        instance_count=1,
-        role=role,
-        transformers_version='4.4',
-        pytorch_version='1.6',
-        py_version='py36',
-        hyperparameters=hyperparameters
-)
-
-```
-
-### SageMaker Metrics
-
-[SageMaker Metrics](https://docs.aws.amazon.com/sagemaker/latest/dg/training-metrics.html#define-train-metrics) can automatically parse the logs for metrics and send those metrics to CloudWatch. If you want SageMaker to parse logs you have to specify the metrics that you want SageMaker to send to CloudWatch when you configure the training job. You specify the name of the metrics that you want to send and the regular expressions that SageMaker uses to parse the logs that your algorithm emits to find those metrics.
-
- [Example Notebook](https://github.com/huggingface/notebooks/blob/master/sagemaker/06_sagemaker_metrics/sagemaker-notebook.ipynb)
-
-```python
-# define metrics definitions
-
-metric_definitions = [
-{"Name": "train_runtime", "Regex": "train_runtime.*=\D*(.*?)$"},
-{"Name": "eval_accuracy", "Regex": "eval_accuracy.*=\D*(.*?)$"},
-{"Name": "eval_loss", "Regex": "eval_loss.*=\D*(.*?)$"},
-]
-
-# create the Estimator
-
-huggingface_estimator = HuggingFace(
-        entry_point='train.py',
-        source_dir='./scripts',
-        instance_type='ml.p3.2xlarge',
-        instance_count=1,
-        role=role,
-        transformers_version='4.4',
-        pytorch_version='1.6',
-        py_version='py36',
-        metric_definitions=metric_definitions,
-        hyperparameters = hyperparameters)
-
-```
-
-
-## Deep Learning Container (DLC) overview
-
-The Deep Learning Container are in every available where Amazon SageMaker is available. You can see the [AWS region table](https://aws.amazon.com/about-aws/global-infrastructure/regional-product-services/) for all AWS global infrastructure. To get an detailed overview of all included packages look [here in the release notes](https://docs.aws.amazon.com/deep-learning-containers/latest/devguide/deep-learning-containers-images.html).
-
-| 🤗 Transformers version | 🤗 Datasets version | PyTorch/TensorFlow version | type     | device | Python Version | Example `image_uri`                                                                                                               |
-| ----------------------- | ------------------- | -------------------------- | -------- | ------ | -------------- | --------------------------------------------------------------------------------------------------------------------------------- |
-| 4.4.2                   | 1.5.0               | PyTorch 1.6.0              | training | GPU    | 3.6            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04`    |
-| 4.4.2                   | 1.5.0               | TensorFlow 2.4.1           | training | GPU    | 3.7            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.4.2-gpu-py37-cu110-ubuntu18.04` |
-| 4.5.0                   | 1.5.0               | PyTorch 1.6.0              | training | GPU    | 3.6            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.4.2-gpu-py36-cu110-ubuntu18.04`    |
-| 4.5.0                   | 1.5.0               | TensorFlow 2.4.1           | training | GPU    | 3.7            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.5.0-gpu-py37-cu110-ubuntu18.04` |
-| 4.6.1                   | 1.6.2               | PyTorch 1.6.0              | training | GPU    | 3.6            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.6.0-transformers4.5.0-gpu-py36-cu110-ubuntu18.04`    |
-| 4.6.1                   | 1.6.2               | PyTorch 1.7.1               | training | GPU    | 3.6            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-pytorch-training:1.7.1-transformers4.6.1-gpu-py36-cu110-ubuntu18.04` |
-| 4.6.1                   | 1.6.2               | TensorFlow 2.4.1           | training | GPU    | 3.7            | `763104351884.dkr.ecr.us-west-2.amazonaws.com/huggingface-tensorflow-training:2.4.1-transformers4.6.1-gpu-py37-cu110-ubuntu18.04` |
-
---
-
-
-## Additional Resources
-
- [Announcement Blog Post](https://huggingface.co/blog/the-partnership-amazon-sagemaker-and-hugging-face)
-
- [AWS and Hugging Face collaborate to simplify and accelerate adoption of natural language processing](https://aws.amazon.com/blogs/machine-learning/aws-and-hugging-face-collaborate-to-simplify-and-accelerate-adoption-of-natural-language-processing-models/)
-
- [Amazon SageMaker documentation for Hugging Face](https://docs.aws.amazon.com/sagemaker/latest/dg/hugging-face.html)
-
- [SageMaker Python SDK](https://sagemaker.readthedocs.io/en/stable/frameworks/huggingface/index.html)
+- [Train Hugging Face models on Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/train)
+- [Deploy Hugging Face models to Amazon SageMaker with the SageMaker Python SDK](https://huggingface.co/docs/sagemaker/inference)
+- [Frequently Asked Questions](https://huggingface.co/docs/sagemaker/faq)
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -21,11 +21,165 @@ Projects `ONNX (Open Neural Network eXchange) <http://onnx.ai>`_ and `ONNXRuntim
 unified and community-driven format to store and, by extension, efficiently execute neural network leveraging a variety
 of hardware and dedicated optimizations.

+
 Starting from transformers v2.10.0 we partnered with ONNX Runtime to provide an easy export of transformers models to
 the ONNX format. You can have a look at the effort by looking at our joint blog post `Accelerate your NLP pipelines
 using Hugging Face Transformers and ONNX Runtime
 <https://medium.com/microsoftazure/accelerate-your-nlp-pipelines-using-hugging-face-transformers-and-onnx-runtime-2443578f4333>`_.

+
+Configuration-based approach
+-----------------------------------------------------------------------------------------------------------------------
+
+Transformers v4.9.0 introduces a new package: ``transformers.onnx``. This package allows converting checkpoints to an
+ONNX graph by leveraging configuration objects. These configuration objects come ready made for a number of model
+architectures, and are made to be easily extendable to other architectures.
+
+Ready-made configurations include the following models:
+
+- ALBERT
+- BART
+- BERT
+- DistilBERT
+- GPT-2
+- RoBERTa
+- T5
+- XLM-RoBERTa
+
+This conversion is handled with the PyTorch version of models - it, therefore, requires PyTorch to be installed. If you
+would like to be able to convert from TensorFlow, please let us know by opening an issue.
+
+.. note::
+    The models showcased here are close to fully feature complete, but do lack some features that are currently in
+    development. Namely, the ability to handle the past key values for decoder models is currently in the works.
+
+
+Converting an ONNX model using the ``transformers.onnx`` package
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The package may be used as a Python module:
+
+.. code-block::
+
+    python -m transformers.onnx --help
+
+    usage: Hugging Face ONNX Exporter tool [-h] -m MODEL -f {pytorch} [--features {default}] [--opset OPSET] [--atol ATOL] output
+
+    positional arguments:
+      output                Path indicating where to store generated ONNX model.
+
+    optional arguments:
+      -h, --help            show this help message and exit
+      -m MODEL, --model MODEL
+                            Model's name of path on disk to load.
+      --features {default}  Export the model with some additional features.
+      --opset OPSET         ONNX opset version to export the model with (default 12).
+      --atol ATOL           Absolute difference tolerance when validating the model.
+
+Exporting a checkpoint using a ready-made configuration can be done as follows:
+
+.. code-block::
+
+    python -m transformers.onnx --model=bert-base-cased onnx/bert-base-cased/
+
+This exports an ONNX graph of the mentioned checkpoint. Here it is `bert-base-cased`, but it can be any model from the
+hub, or a local path.
+
+It will be exported under ``onnx/bert-base-cased``. You should see similar logs:
+
+.. code-block::
+
+    Validating ONNX model...
+            -[✓] ONNX model outputs' name match reference model ({'pooler_output', 'last_hidden_state'}
+            - Validating ONNX Model output "last_hidden_state":
+                    -[✓] (2, 8, 768) matchs (2, 8, 768)
+                    -[✓] all values close (atol: 0.0001)
+            - Validating ONNX Model output "pooler_output":
+                    -[✓] (2, 768) matchs (2, 768)
+                    -[✓] all values close (atol: 0.0001)
+    All good, model saved at: onnx/bert-base-cased/model.onnx
+
+This export can now be used in the ONNX inference runtime:
+
+.. code-block::
+
+    import onnxruntime as ort
+
+    from transformers import BertTokenizerFast
+    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")
+
+    ort_session = ort.InferenceSession("onnx/bert-base-cased/model.onnx")
+
+    inputs = tokenizer("Using BERT in ONNX!", return_tensors="np")
+    outputs = ort_session.run(["last_hidden_state", "pooler_output"], dict(inputs))
+
+The outputs used (:obj:`["last_hidden_state", "pooler_output"]`) can be obtained by taking a look at the ONNX
+configuration of each model. For example, for BERT:
+
+.. code-block::
+
+    from transformers.models.bert import BertOnnxConfig, BertConfig
+
+    config = BertConfig()
+    onnx_config = BertOnnxConfig(config)
+    output_keys = list(onnx_config.outputs.keys())
+
+Implementing a custom configuration for an unsupported architecture
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Let's take a look at the changes necessary to add a custom configuration for an unsupported architecture. Firstly, we
+will need a custom ONNX configuration object that details the model inputs and outputs. The BERT ONNX configuration is
+visible below:
+
+.. code-block::
+
+    class BertOnnxConfig(OnnxConfig):
+        @property
+        def inputs(self) -> Mapping[str, Mapping[int, str]]:
+            return OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "sequence"}),
+                    ("attention_mask", {0: "batch", 1: "sequence"}),
+                    ("token_type_ids", {0: "batch", 1: "sequence"}),
+                ]
+            )
+
+        @property
+        def outputs(self) -> Mapping[str, Mapping[int, str]]:
+            return OrderedDict([("last_hidden_state", {0: "batch", 1: "sequence"}), ("pooler_output", {0: "batch"})])
+
+Let's understand what's happening here. This configuration has two properties: the inputs, and the outputs.
+
+The inputs return a dictionary, where each key corresponds to an expected input, and each value indicates the axis of
+that input.
+
+For BERT, there are three necessary inputs. These three inputs are of similar shape, which is made up of two
+dimensions: the batch is the first dimension, and the second is the sequence.
+
+The outputs return a similar dictionary, where, once again, each key corresponds to an expected output, and each value
+indicates the axis of that output.
+
+Once this is done, a single step remains: adding this configuration object to the initialisation of the model class,
+and to the general ``transformers`` initialisation.
+
+An important fact to notice is the use of `OrderedDict` in both inputs and outputs properties. This is a requirements
+as inputs are matched against their relative position within the `PreTrainedModel.forward()` prototype and outputs are
+match against there position in the returned `BaseModelOutputX` instance.
+
+An example of such an addition is visible here, for the MBart model: `Making MBART ONNX-convertible
+<https://github.com/huggingface/transformers/pull/13049/commits/d097adcebd89a520f04352eb215a85916934204f>`__
+
+If you would like to contribute your addition to the library, we recommend you implement tests. An example of such
+tests is visible here: `Adding tests to the MBART ONNX conversion
+<https://github.com/huggingface/transformers/pull/13049/commits/5d642f65abf45ceeb72bd855ca7bfe2506a58e6a>`__
+
+Graph conversion
+-----------------------------------------------------------------------------------------------------------------------
+
+.. note::
+    The approach detailed here is bing deprecated. We recommend you follow the part above for an up to date approach.
+
+
 Exporting a model is done through the script `convert_graph_to_onnx.py` at the root of the transformers sources. The
 following command shows how easy it is to export a BERT model from the library, simply run:

--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -1170,6 +1170,23 @@ To start a debugger at the point of the warning, do this:
    pytest tests/test_logging.py -W error::UserWarning --pdb


+Working with github actions workflows
+-----------------------------------------------------------------------------------------------------------------------
+
+To trigger a self-push workflow CI job, you must:
+
+1. Create a new branch on ``transformers`` origin (not a fork!).
+2. The branch name has to start with either ``ci_`` or ``ci-`` (``master`` triggers it too, but we can't do PRs on
+   ``master``). It also gets triggered only for specific paths - you can find the up-to-date definition in case it
+   changed since this document has been written `here
+   <https://github.com/huggingface/transformers/blob/master/.github/workflows/self-push.yml>`__ under `push:`
+3. Create a PR from this branch.
+4. Then you can see the job appear `here
+   <https://github.com/huggingface/transformers/actions/workflows/self-push.yml>`__. It may not run right away if there
+   is a backlog.
+
+
+

 Testing Experimental CI Features
 -----------------------------------------------------------------------------------------------------------------------
--- a/docs/source/training.rst
+++ b/docs/source/training.rst
@@ -152,7 +152,7 @@ To fine-tune our model, we just need to call
    trainer.train()

 which will start a training that you can follow with a progress bar, which should take a couple of minutes to complete
-(as long as you hav access to a GPU). It won't actually tell you anything useful about how well (or badly) your model
+(as long as you have access to a GPU). It won't actually tell you anything useful about how well (or badly) your model
 is performing however as by default, there is no evaluation during training, and we didn't tell the
 :class:`~transformers.Trainer` to compute any metrics. Let's have a look on how to do that now!

--- a/examples/flax/README.md
+++ b/examples/flax/README.md
@@ -19,6 +19,17 @@ This folder contains actively maintained examples of 🤗 Transformers using the

 *NOTE*: Currently, there is no "Trainer" abstraction for JAX/Flax -- all examples contain an explicit training loop.

+The following table lists all of our examples on how to use 🤗 Transformers with the JAX/Flax backend:
+- with information about the model and dataset used,
+- whether or not they leverage the [🤗 Datasets](https://github.com/huggingface/datasets) library,
+- links to **Colab notebooks** to walk through the scripts and run them easily.
+
+| Task | Example model | Example dataset | 🤗 Datasets | Colab
+|---|---|---|:---:|:---:|
+| [**`causal-language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling) | GPT2 | OSCAR | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/causal_language_modeling_flax.ipynb)
+| [**`masked-language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling) | RoBERTa | OSCAR | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/masked_language_modeling_flax.ipynb)
+| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/flax/text-classification) | BERT | GLUE | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification_flax.ipynb)
+
 ## Intro: JAX and Flax

 [JAX](https://github.com/google/jax) is a numerical computation library that exposes a NumPy-like API with tracing capabilities. With JAX's `jit`, you can
@@ -47,17 +58,4 @@ be adding a guide for porting models from PyTorch in the upcoming few weeks.
 For a complete overview of models that are supported in JAX/Flax, please have a look at [this](https://huggingface.co/transformers/master/index.html#supported-frameworks) table.

 Over 3000 pretrained checkpoints are supported in JAX/Flax as of May 2021.
-Click [here](https://huggingface.co/models?filter=jax) to see the full list on the 🤗 hub. 
-
-## Examples
-
-The following table lists all of our examples on how to use 🤗 Transformers with the JAX/Flax backend:
- with information about the model and dataset used,
- whether or not they leverage the [🤗 Datasets](https://github.com/huggingface/datasets) library,
- links to **Colab notebooks** to walk through the scripts and run them easily.
-
-| Task | Example model | Example dataset | 🤗 Datasets | Colab
-|---|---|---|:---:|:---:|
-| [**`causal-language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling) | GPT2 | OSCAR | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/causal_language_modeling_flax.ipynb)
-| [**`masked-language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling) | RoBERTa | OSCAR | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/masked_language_modeling_flax.ipynb)
-| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/flax/text-classification) | BERT | GLUE | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification_flax.ipynb)
+Click [here](https://huggingface.co/models?filter=jax) to see the full list on the 🤗 hub.
--- a/examples/flax/language-modeling/README.md
+++ b/examples/flax/language-modeling/README.md
@@ -33,11 +33,37 @@ in Norwegian on a single TPUv3-8 pod.

 The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.

-Let's start by creating a folder to save the trained model and a symbolic link to the `run_mlm_flax.py` script.
+Let's start by creating a model repository to save the trained model and logs.
+Here we call the model `"norwegian-roberta-base"`, but you can change the model name as you like.
+
+You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
+you are logged in) or via the command line:
+
+```
+huggingface-cli repo create norwegian-roberta-base
+```
+
+Next we clone the model repository to add the tokenizer and model files.
+
+```
+git clone https://huggingface.co/<your-username>/norwegian-roberta-base
+```
+
+To ensure that all tensorboard traces will be uploaded correctly, we need to 
+track them. You can run the following command inside your model repo to do so.
+
+```
+cd norwegian-roberta-base
+git lfs track "*tfevents*"
+```
+
+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
+Next, let's add a symbolic link to the `run_mlm_flax.py`.

 ```bash
 export MODEL_DIR="./norwegian-roberta-base"
-mkdir -p ${MODEL_DIR}
 ln -s ~/transformers/examples/flax/language-modeling/run_mlm_flax.py run_mlm_flax.py
 ```

@@ -88,7 +114,7 @@ from transformers import RobertaConfig

 model_dir = "./norwegian-roberta-base"  # ${MODEL_DIR}

-config = RobertaConfig.from_pretrained("roberta-base")
+config = RobertaConfig.from_pretrained("roberta-base", vocab_size=tokenizer.vocab_size)
 config.save_pretrained(model_dir)
 ```

@@ -98,7 +124,7 @@ Next we can run the example script to pretrain the model:

 ```bash
 ./run_mlm_flax.py \
-    --output_dir="./runs" \
+    --output_dir="${MODEL_DIR}" \
    --model_type="roberta" \
    --config_name="${MODEL_DIR}" \
    --tokenizer_name="${MODEL_DIR}" \
@@ -111,10 +137,13 @@ Next we can run the example script to pretrain the model:
    --learning_rate="3e-4" \
    --warmup_steps="1000" \
    --overwrite_output_dir \
-    --pad_to_max_length \
    --num_train_epochs="18" \
    --adam_beta1="0.9" \
-    --adam_beta2="0.98"
+    --adam_beta2="0.98" \
+    --logging_steps="500" \
+    --save_steps="2500" \
+    --eval_steps="2500" \
+    --push_to_hub
 ```

 Training should converge at a loss and accuracy 
@@ -135,11 +164,37 @@ in Norwegian on a single TPUv3-8 pod.

 The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.

-Let's start by creating a folder to save the trained model and a symbolic link to the `run_clm_flax.py` script.
+Let's start by creating a model repository to save the trained model and logs.
+Here we call the model `"norwegian-gpt2"`, but you can change the model name as you like.
+
+You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
+you are logged in) or via the command line:
+
+```
+huggingface-cli repo create norwegian-gpt2
+```
+
+Next we clone the model repository to add the tokenizer and model files.
+
+```
+git clone https://huggingface.co/<your-username>/norwegian-gpt2
+```
+
+To ensure that all tensorboard traces will be uploaded correctly, we need to 
+track them. You can run the following command inside your model repo to do so.
+
+```
+cd norwegian-gpt2
+git lfs track "*tfevents*"
+```
+
+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
+Next, let's add a symbolic link to the `run_clm_flax.py`.

 ```bash
 export MODEL_DIR="./norwegian-gpt2"
-mkdir -p ${MODEL_DIR}
 ln -s ~/transformers/examples/flax/language-modeling/run_clm_flax.py run_clm_flax.py
 ```

@@ -156,7 +211,7 @@ from transformers import GPT2Config

 model_dir = "./norwegian-gpt2"  # ${MODEL_DIR}

-config = GPT2Config.from_pretrained("gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0)
+config = GPT2Config.from_pretrained("gpt2", resid_pdrop=0.0, embd_pdrop=0.0, attn_pdrop=0.0, vocab_size=tokenizer.vocab_size)
 config.save_pretrained(model_dir)
 ```

@@ -166,7 +221,7 @@ Next we can run the example script to pretrain the model:

 ```bash
 ./run_clm_flax.py \
-    --output_dir="./runs" \
+    --output_dir="${MODEL_DIR}" \
    --model_type="gpt2" \
    --config_name="${MODEL_DIR}" \
    --tokenizer_name="${MODEL_DIR}" \
@@ -180,6 +235,10 @@ Next we can run the example script to pretrain the model:
    --adam_beta1="0.9" --adam_beta2="0.98" --weight_decay="0.01" \
    --overwrite_output_dir \
    --num_train_epochs="20" \
+    --logging_steps="500" \
+    --save_steps="2500" \
+    --eval_steps="2500" \
+    --push_to_hub
 ```

 Training should converge at a loss and perplexity 
@@ -187,6 +246,143 @@ of 3.24 and 25.72 respectively after 20 epochs on a single TPUv3-8.
 This should take less than ~21 hours.
 Training statistics can be accessed on [tfhub.de](https://tensorboard.dev/experiment/2zEhLwJ0Qp2FAkI3WVH9qA).

+## T5-like span-masked language modeling
+
+In the following, we demonstrate how to train a T5 model using the span-masked language model 
+objective as proposed in the [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683).
+More specifically, we demonstrate how JAX/Flax can be leveraged 
+to pre-train [**`google/t5-v1_1-base`**](https://huggingface.co/google/t5-v1_1-base)
+in Norwegian on a single TPUv3-8 pod.
+
+The example script uses the 🤗 Datasets library. You can easily customize them to your needs if you need extra processing on your datasets.
+
+Let's start by creating a model repository to save the trained model and logs.
+Here we call the model `"norwegian-t5-base"`, but you can change the model name as you like.
+
+You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
+you are logged in) or via the command line:
+
+```
+huggingface-cli repo create norwegian-t5-base
+```
+
+Next we clone the model repository to add the tokenizer and model files.
+
+```
+git clone https://huggingface.co/<your-username>/norwegian-t5-base
+```
+
+To ensure that all tensorboard traces will be uploaded correctly, we need to 
+track them. You can run the following command inside your model repo to do so.
+
+```
+cd norwegian-t5-base
+git lfs track "*tfevents*"
+```
+
+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
+Next, let's add a symbolic link to the `run_t5_mlm_flax.py` and `t5_tokenizer_model` scripts.
+
+```bash
+export MODEL_DIR="./norwegian-t5-base"
+ln -s ~/transformers/examples/flax/language-modeling/run_t5_mlm_flax.py run_t5_mlm_flax.py
+ln -s ~/transformers/examples/flax/language-modeling/t5_tokenizer_model.py t5_tokenizer_model.py
+```
+
+### Train tokenizer
+
+In the first step, we train a tokenizer to efficiently process the text input for the model. 
+We make use of the [tokenizers](https://github.com/huggingface/tokenizers) library to train 
+a sentencepiece unigram tokenizer as shown in [t5_tokenizer_model.py](https://github.com/huggingface/transformers/tree/master/examples/flax/language-modeling/t5_tokenizer_model.py) 
+which is heavily inspired from [yandex-research/DeDLOC's tokenizer model](https://github.com/yandex-research/DeDLOC/blob/5c994bc64e573702a9a79add3ecd68b38f14b548/sahajbert/tokenizer/tokenizer_model.py) .
+
+The tokenizer is trained on the complete Norwegian dataset of OSCAR
+and consequently saved in `${MODEL_DIR}`
+This can take up to 120 minutes depending on your hardware ☕☕☕ .
+
+```python
+import datasets
+
+from t5_tokenizer_model import SentencePieceUnigramTokenizer
+
+
+vocab_size = 32_000
+input_sentence_size = None
+model_dir = "./norwegian-t5-base"  # ${MODEL_DIR}
+
+# Initialize a dataset
+dataset = datasets.load_dataset("oscar", name="unshuffled_deduplicated_no", split="train")
+
+tokenizer = SentencePieceUnigramTokenizer(unk_token="<unk>", eos_token="</s>", pad_token="<pad>")
+
+
+# Build an iterator over this dataset
+def batch_iterator(input_sentence_size=None):
+    if input_sentence_size is None:
+        input_sentence_size = len(dataset)
+    batch_length = 100
+    for i in range(0, input_sentence_size, batch_length):
+        yield dataset[i: i + batch_length]["text"]
+
+
+# Train tokenizer
+tokenizer.train_from_iterator(
+    iterator=batch_iterator(input_sentence_size=input_sentence_size),
+    vocab_size=vocab_size,
+    show_progress=True,
+)
+
+# Save files to disk
+tokenizer.save(f"{model_dir}/tokenizer.json")
+```
+
+### Create configuration
+
+Next, we create the model's configuration file. This is as simple 
+as loading and storing [`**google/t5-v1_1-base**`](https://huggingface.co/google/t5-v1_1-base)
+in the local model folder:
+
+```python
+from transformers import T5Config
+
+model_dir = "./norwegian-t5-base"  # ${MODEL_DIR}
+
+config = T5Config.from_pretrained("google/t5-v1_1-base", vocab_size=tokenizer.vocab_size)
+config.save_pretrained(model_dir)
+```
+
+### Train model
+
+Next we can run the example script to pretrain the model:
+
+```bash
+./run_t5_mlm_flax.py \
+	--output_dir="./" \
+	--model_type="t5" \
+	--config_name="./" \
+	--tokenizer_name="./" \
+	--dataset_name="oscar" \
+	--dataset_config_name="unshuffled_deduplicated_no" \
+	--max_seq_length="512" \
+	--per_device_train_batch_size="32" \
+	--per_device_eval_batch_size="32" \
+	--adafactor \
+	--learning_rate="0.005" \
+	--weight_decay="0.001" \
+	--warmup_steps="2000" \
+	--overwrite_output_dir \
+	--logging_steps="100" \
+	--save_steps="1000" \
+	--eval_steps="1000" \
+	--push_to_hub
+```
+
+Training should converge at a loss and accuracy 
+of 2.2 and 58.0 respectively after 2 epochs on a single TPUv3-8.
+This should take around 24 hours.
+Training statistics can be accessed on directly on the 🤗 [hub](https://huggingface.co/patrickvonplaten/t5-base-norwegian/tensorboard)

 ## Runtime evaluation

@@ -197,14 +393,9 @@ For reproducibility, we state the training commands used for PyTorch/XLA and PyT
 | Task  | [TPU v3-8 (Flax)](https://tensorboard.dev/experiment/GdYmdak2TWeVz0DDRYOrrg/)  | [TPU v3-8 (Pytorch/XLA)](https://tensorboard.dev/experiment/7Jq1kcQQRAmy12KOdXek7A/)| [8 GPU (PyTorch)](https://tensorboard.dev/experiment/PJneV8FQRxa2unPw1QnVHA)  |
 |-------|-----------|------------|------------|
 | MLM   |  15h32m   |  23h46m    | 44h14m     |
-| **COST*** | $124.24  | $187.84 | $877.92 |

-*All experiments are ran on Google Cloud Platform. Prices are on-demand prices
-(not preemptible), obtained on May 12, 2021 for zone Iowa (us-central1) using
-the following tables:
-[TPU pricing table](https://cloud.google.com/tpu/pricing) ($8.00/h for v3-8),
-[GPU pricing table](https://cloud.google.com/compute/gpus-pricing) ($2.48/h per
-V100 GPU). GPU experiments are ran without further optimizations besides JAX
+*All experiments are ran on Google Cloud Platform. 
+GPU experiments are ran without further optimizations besides JAX
 transformations. GPU experiments are ran with full precision (fp32). "TPU v3-8"
 are 8 TPU cores on 4 chips (each chips has 2 cores), while "8 GPU" are 8 GPU chips.

@@ -281,7 +472,7 @@ mkdir -p ${MODEL_DIR}

 ```bash
 python3 -m torch.distributed.launch --nproc_per_node ${NUM_GPUS} run_mlm.py \
-    --output_dir="./runs" \
+    --output_dir="${MODEL_DIR}" \
    --model_type="roberta" \
    --config_name="${MODEL_DIR}" \
    --tokenizer_name="${MODEL_DIR}" \
--- a/examples/flax/language-modeling/requirements.txt
+++ b/examples/flax/language-modeling/requirements.txt
@@ -2,4 +2,4 @@ datasets >= 1.1.3
 jax>=0.2.8
 jaxlib>=0.1.59
 flax>=0.3.4
-optax>=0.0.8
+optax>=0.0.9
--- a/examples/flax/language-modeling/run_clm_flax.py
+++ b/examples/flax/language-modeling/run_clm_flax.py
@@ -57,22 +57,6 @@ from transformers.testing_utils import CaptureLogger

 logger = logging.getLogger(__name__)

-# Cache the result
-has_tensorboard = is_tensorboard_available()
-if has_tensorboard:
-    try:
-        from flax.metrics.tensorboard import SummaryWriter
-    except ImportError as ie:
-        has_tensorboard = False
-        print(f"Unable to display metrics through TensorBoard because some package are not installed: {ie}")
-
-else:
-    print(
-        "Unable to display metrics through TensorBoard because the package is not installed: "
-        "Please run pip install tensorboard to enable."
-    )
-
-
 MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_CAUSAL_LM_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

@@ -214,7 +198,7 @@ def data_loader(rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuf
        yield batch


-def write_metric(summary_writer, train_metrics, eval_metrics, train_time, step):
+def write_train_metric(summary_writer, train_metrics, train_time, step):
    summary_writer.scalar("train_time", train_time, step)

    train_metrics = get_metrics(train_metrics)
@@ -223,6 +207,8 @@ def write_metric(summary_writer, train_metrics, eval_metrics, train_time, step):
        for i, val in enumerate(vals):
            summary_writer.scalar(tag, val, step - len(vals) + i + 1)

+
+def write_eval_metric(summary_writer, eval_metrics, step):
    for metric_name, value in eval_metrics.items():
        summary_writer.scalar(f"eval_{metric_name}", value, step)

@@ -267,7 +253,7 @@ def main():

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
@@ -321,6 +307,20 @@ def main():
        if extension == "txt":
            extension = "text"
        dataset = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -412,7 +412,8 @@ def main():
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
-        total_length = (total_length // block_size) * block_size
+        if total_length >= block_size:
+            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
@@ -450,8 +451,22 @@ def main():
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
    if has_tensorboard and jax.process_index() == 0:
-        summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir).joinpath("logs").as_posix())
+        try:
+            from flax.metrics.tensorboard import SummaryWriter
+
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    else:
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )

    # Initialize our training
    rng = jax.random.PRNGKey(training_args.seed)
@@ -477,23 +492,36 @@ def main():
    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
    # mask boolean with the same structure as the parameters.
    # The mask is True for parameters that should be decayed.
+    # Note that this mask is specifically adapted for FlaxGPT2.
+    # For other models, one should correct the layer norm parameter naming
+    # accordingly.
    def decay_mask_fn(params):
        flat_params = traverse_util.flatten_dict(params)
-        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        flat_mask = {
+            path: (path[-1] != "bias" and path[-2:] not in [("ln_1", "scale"), ("ln_2", "scale"), ("ln_f", "scale")])
+            for path in flat_params
+        }
        return traverse_util.unflatten_dict(flat_mask)

    # create adam optimizer
-    adamw = optax.adamw(
-        learning_rate=linear_decay_lr_schedule_fn,
-        b1=training_args.adam_beta1,
-        b2=training_args.adam_beta2,
-        eps=training_args.adam_epsilon,
-        weight_decay=training_args.weight_decay,
-        mask=decay_mask_fn,
-    )
+    if training_args.adafactor:
+        # We use the default parameters here to initialize adafactor,
+        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+        optimizer = optax.adafactor(
+            learning_rate=linear_decay_lr_schedule_fn,
+        )
+    else:
+        optimizer = optax.adamw(
+            learning_rate=linear_decay_lr_schedule_fn,
+            b1=training_args.adam_beta1,
+            b2=training_args.adam_beta2,
+            eps=training_args.adam_epsilon,
+            weight_decay=training_args.weight_decay,
+            mask=decay_mask_fn,
+        )

    # Setup train state
-    state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw, dropout_rng=dropout_rng)
+    state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer, dropout_rng=dropout_rng)

    def loss_fn(logits, labels):
        shift_logits = logits[..., :-1, :]
@@ -548,6 +576,7 @@ def main():
    logger.info(f"  Total optimization steps = {total_train_steps}")

    train_time = 0
+    train_metrics = []
    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
    for epoch in epochs:
        # ======================== Training ================================
@@ -555,59 +584,70 @@ def main():

        # Create sampling rng
        rng, input_rng = jax.random.split(rng)
-        train_metrics = []

        # Generate an epoch by shuffling sampling indices from the train dataset
        train_loader = data_loader(input_rng, train_dataset, train_batch_size, shuffle=True)
        steps_per_epoch = len(train_dataset) // train_batch_size
        # train
-        for _ in tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False):
+        for step in tqdm(range(steps_per_epoch), desc="Training...", position=1, leave=False):
            batch = next(train_loader)
            state, train_metric = p_train_step(state, batch)
            train_metrics.append(train_metric)

-        train_time += time.time() - train_start
+            cur_step = epoch * (len(train_dataset) // train_batch_size) + step

-        train_metric = unreplicate(train_metric)
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if has_tensorboard and jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)

-        epochs.write(
-            f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
-        )
+                epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
+                )

-        # ======================== Evaluating ==============================
-        eval_metrics = []
-        eval_loader = data_loader(input_rng, eval_dataset, eval_batch_size)
-        eval_steps = len(eval_dataset) // eval_batch_size
-        for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False):
-            # Model forward
-            batch = next(eval_loader)
-            metrics = p_eval_step(state.params, batch)
-            eval_metrics.append(metrics)
+                train_metrics = []

-        # normalize eval metrics
-        eval_metrics = get_metrics(eval_metrics)
+            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+                # ======================== Evaluating ==============================
+                eval_metrics = []
+                eval_loader = data_loader(input_rng, eval_dataset, eval_batch_size)
+                eval_steps = len(eval_dataset) // eval_batch_size
+                for _ in tqdm(range(eval_steps), desc="Evaluating...", position=2, leave=False):
+                    # Model forward
+                    batch = next(eval_loader)
+                    metrics = p_eval_step(state.params, batch)
+                    eval_metrics.append(metrics)

-        eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
+                # normalize eval metrics
+                eval_metrics = get_metrics(eval_metrics)
+                eval_metrics = jax.tree_map(jnp.mean, eval_metrics)

-        try:
-            eval_metrics["perplexity"] = math.exp(eval_metrics["loss"])
-        except OverflowError:
-            eval_metrics["perplexity"] = float("inf")
+                try:
+                    eval_metrics["perplexity"] = math.exp(eval_metrics["loss"])
+                except OverflowError:
+                    eval_metrics["perplexity"] = float("inf")

-        # Print metrics and update progress bar
-        desc = f"Epoch... ({epoch + 1}/{num_epochs} | Eval Loss: {eval_metrics['loss']} | Eval Perplexity: {eval_metrics['perplexity']})"
-        epochs.write(desc)
-        epochs.desc = desc
+                # Print metrics and update progress bar
+                desc = f"Step... ({cur_step} | Eval Loss: {eval_metrics['loss']} | Eval Perplexity: {eval_metrics['perplexity']})"
+                epochs.write(desc)
+                epochs.desc = desc

-        # Save metrics
-        if has_tensorboard and jax.process_index() == 0:
-            cur_step = epoch * (len(train_dataset) // train_batch_size)
-            write_metric(summary_writer, train_metrics, eval_metrics, train_time, cur_step)
+                # Save metrics
+                if has_tensorboard and jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)

-    # save last checkpoint
-    if jax.process_index() == 0:
-        params = jax.device_get(unreplicate(state.params))
-        model.save_pretrained(training_args.output_dir, params=params)
+            if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                # save checkpoint after each epoch and push checkpoint to the hub
+                if jax.process_index() == 0:
+                    params = jax.device_get(unreplicate(state.params))
+                    model.save_pretrained(
+                        training_args.output_dir,
+                        params=params,
+                        push_to_hub=training_args.push_to_hub,
+                        commit_message=f"Saving weights and logs of step {cur_step}",
+                    )


 if __name__ == "__main__":
--- a/examples/flax/language-modeling/run_mlm_flax.py
+++ b/examples/flax/language-modeling/run_mlm_flax.py
@@ -56,22 +56,6 @@ from transformers import (
 )


-# Cache the result
-has_tensorboard = is_tensorboard_available()
-if has_tensorboard:
-    try:
-        from flax.metrics.tensorboard import SummaryWriter
-    except ImportError as ie:
-        has_tensorboard = False
-        print(f"Unable to display metrics through TensorBoard because some package are not installed: {ie}")
-
-else:
-    print(
-        "Unable to display metrics through TensorBoard because the package is not installed: "
-        "Please run pip install tensorboard to enable."
-    )
-
-
 MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
 MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

@@ -269,7 +253,7 @@ def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndar
    return batch_idx


-def write_metric(train_metrics, eval_metrics, train_time, step):
+def write_train_metric(summary_writer, train_metrics, train_time, step):
    summary_writer.scalar("train_time", train_time, step)

    train_metrics = get_metrics(train_metrics)
@@ -278,6 +262,8 @@ def write_metric(train_metrics, eval_metrics, train_time, step):
        for i, val in enumerate(vals):
            summary_writer.scalar(tag, val, step - len(vals) + i + 1)

+
+def write_eval_metric(summary_writer, eval_metrics, step):
    for metric_name, value in eval_metrics.items():
        summary_writer.scalar(f"eval_{metric_name}", value, step)

@@ -308,17 +294,13 @@ if __name__ == "__main__":

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        level="NOTSET",
        datefmt="[%X]",
    )

    # Log on each process the small summary:
    logger = logging.getLogger(__name__)
-    logger.warning(
-        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
-        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
-    )

    # Set the verbosity to info of the Transformers logger (on main process only):
    logger.info(f"Training/evaluation parameters {training_args}")
@@ -362,6 +344,20 @@ if __name__ == "__main__":
        if extension == "txt":
            extension = "text"
        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -449,7 +445,8 @@ if __name__ == "__main__":
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
-            total_length = (total_length // max_seq_length) * max_seq_length
+            if total_length >= max_seq_length:
+                total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
@@ -471,8 +468,22 @@ if __name__ == "__main__":
        )

    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
    if has_tensorboard and jax.process_index() == 0:
-        summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir).joinpath("logs").as_posix())
+        try:
+            from flax.metrics.tensorboard import SummaryWriter
+
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    else:
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )

    # Data collator
    # This one will take care of randomly masking the tokens.
@@ -482,7 +493,14 @@ if __name__ == "__main__":
    rng = jax.random.PRNGKey(training_args.seed)
    dropout_rngs = jax.random.split(rng, jax.local_device_count())

-    model = FlaxAutoModelForMaskedLM.from_config(config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype))
+    if model_args.model_name_or_path:
+        model = FlaxAutoModelForMaskedLM.from_pretrained(
+            model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    else:
+        model = FlaxAutoModelForMaskedLM.from_config(
+            config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )

    # Store some constant
    num_epochs = int(training_args.num_train_epochs)
@@ -508,23 +526,33 @@ if __name__ == "__main__":
    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
    # mask boolean with the same structure as the parameters.
    # The mask is True for parameters that should be decayed.
+    # Note that this mask is specifically adapted for FlaxBERT-like models.
+    # For other models, one should correct the layer norm parameter naming
+    # accordingly.
    def decay_mask_fn(params):
        flat_params = traverse_util.flatten_dict(params)
        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
        return traverse_util.unflatten_dict(flat_mask)

    # create adam optimizer
-    adamw = optax.adamw(
-        learning_rate=linear_decay_lr_schedule_fn,
-        b1=training_args.adam_beta1,
-        b2=training_args.adam_beta2,
-        eps=1e-8,
-        weight_decay=training_args.weight_decay,
-        mask=decay_mask_fn,
-    )
+    if training_args.adafactor:
+        # We use the default parameters here to initialize adafactor,
+        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+        optimizer = optax.adafactor(
+            learning_rate=linear_decay_lr_schedule_fn,
+        )
+    else:
+        optimizer = optax.adamw(
+            learning_rate=linear_decay_lr_schedule_fn,
+            b1=training_args.adam_beta1,
+            b2=training_args.adam_beta2,
+            eps=training_args.adam_epsilon,
+            weight_decay=training_args.weight_decay,
+            mask=decay_mask_fn,
+        )

    # Setup train state
-    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw)
+    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)

    # Define gradient update step fn
    def train_step(state, batch, dropout_rng):
@@ -582,12 +610,12 @@ if __name__ == "__main__":
    # Replicate the train state on each device
    state = jax_utils.replicate(state)

-    train_metrics = []
    train_time = 0
    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
    for epoch in epochs:
        # ======================== Training ================================
        train_start = time.time()
+        train_metrics = []

        # Create sampling rng
        rng, input_rng = jax.random.split(rng)
@@ -598,7 +626,7 @@ if __name__ == "__main__":
        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)

        # Gather the indexes for creating the batch and do a training step
-        for i, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
            model_inputs = data_collator(samples, pad_to_multiple_of=16)

@@ -607,44 +635,57 @@ if __name__ == "__main__":
            state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
            train_metrics.append(train_metric)

-        train_time += time.time() - train_start
+            cur_step = epoch * (num_train_samples // train_batch_size) + step

-        epochs.write(
-            f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
-        )
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = jax_utils.unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if has_tensorboard and jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)

-        # ======================== Evaluating ==============================
-        num_eval_samples = len(tokenized_datasets["validation"])
-        eval_samples_idx = jnp.arange(num_eval_samples)
-        eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+                epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
+                )

-        eval_metrics = []
-        for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
-            samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
-            model_inputs = data_collator(samples, pad_to_multiple_of=16)
+                train_metrics = []

-            # Model forward
-            model_inputs = shard(model_inputs.data)
-            metrics = p_eval_step(state.params, model_inputs)
-            eval_metrics.append(metrics)
+            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+                # ======================== Evaluating ==============================
+                num_eval_samples = len(tokenized_datasets["validation"])
+                eval_samples_idx = jnp.arange(num_eval_samples)
+                eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)

-        # normalize eval metrics
-        eval_metrics = get_metrics(eval_metrics)
-        eval_metrics = jax.tree_map(jnp.sum, eval_metrics)
-        eval_normalizer = eval_metrics.pop("normalizer")
-        eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
+                eval_metrics = []
+                for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+                    samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+                    model_inputs = data_collator(samples, pad_to_multiple_of=16)

-        # Update progress bar
-        epochs.desc = (
-            f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
-        )
+                    # Model forward
+                    model_inputs = shard(model_inputs.data)
+                    metrics = p_eval_step(state.params, model_inputs)
+                    eval_metrics.append(metrics)

-        # Save metrics
-        if has_tensorboard and jax.process_index() == 0:
-            cur_step = epoch * (len(tokenized_datasets["train"]) // train_batch_size)
-            write_metric(train_metrics, eval_metrics, train_time, cur_step)
+                # normalize eval metrics
+                eval_metrics = get_metrics(eval_metrics)
+                eval_metrics = jax.tree_map(jnp.sum, eval_metrics)
+                eval_normalizer = eval_metrics.pop("normalizer")
+                eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)

-    # save last checkpoint
-    if jax.process_index() == 0:
-        params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-        model.save_pretrained(training_args.output_dir, params=params)
+                # Update progress bar
+                epochs.desc = f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
+
+                # Save metrics
+                if has_tensorboard and jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)
+
+            if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                # save checkpoint after each epoch and push checkpoint to the hub
+                if jax.process_index() == 0:
+                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+                    model.save_pretrained(
+                        training_args.output_dir,
+                        params=params,
+                        push_to_hub=training_args.push_to_hub,
+                        commit_message=f"Saving weights and logs of step {cur_step}",
+                    )
--- a/examples/flax/language-modeling/run_t5_mlm_flax.py
+++ b/examples/flax/language-modeling/run_t5_mlm_flax.py
@@ -0,0 +1,798 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Pretraining the library models for T5-like span-masked language modeling on a text file or a dataset.
+
+Here is the full list of checkpoints on the hub that can be pretrained by this script:
+https://huggingface.co/models?filter=t5
+"""
+# You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
+import logging
+import os
+import sys
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Dict, List, Optional
+
+import numpy as np
+from datasets import load_dataset
+from tqdm import tqdm
+
+import flax
+import jax
+import jax.numpy as jnp
+import optax
+from flax import jax_utils, traverse_util
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard
+from transformers import (
+    CONFIG_MAPPING,
+    FLAX_MODEL_FOR_MASKED_LM_MAPPING,
+    AutoTokenizer,
+    BatchEncoding,
+    FlaxT5ForConditionalGeneration,
+    HfArgumentParser,
+    PreTrainedTokenizerBase,
+    T5Config,
+    TrainingArguments,
+    is_tensorboard_available,
+    set_seed,
+)
+from transformers.models.t5.modeling_flax_t5 import shift_tokens_right
+
+
+MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+    """
+
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The model checkpoint for weights initialization."
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+    use_fast_tokenizer: bool = field(
+        default=True,
+        metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
+    )
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
+        },
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    dataset_name: Optional[str] = field(
+        default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
+    )
+    dataset_config_name: Optional[str] = field(
+        default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
+    )
+    train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
+    validation_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
+    )
+    train_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
+    )
+    validation_ref_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    validation_split_percentage: Optional[int] = field(
+        default=5,
+        metadata={
+            "help": "The percentage of the train set used as validation set in case there's no validation split"
+        },
+    )
+    max_seq_length: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization and masking. Sequences longer than this will be truncated. Default to the max input length of the model."
+        },
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+    mlm_probability: float = field(
+        default=0.15, metadata={"help": "Ratio of tokens to mask for span masked language modeling loss"}
+    )
+    mean_noise_span_length: float = field(
+        default=3.0,
+        metadata={"help": "Mean span length of masked tokens"},
+    )
+
+    def __post_init__(self):
+        if self.dataset_name is None and self.train_file is None and self.validation_file is None:
+            raise ValueError("Need either a dataset name or a training/validation file.")
+        else:
+            if self.train_file is not None:
+                extension = self.train_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`train_file` should be a csv, a json or a txt file."
+            if self.validation_file is not None:
+                extension = self.validation_file.split(".")[-1]
+                assert extension in ["csv", "json", "txt"], "`validation_file` should be a csv, a json or a txt file."
+
+
+def compute_input_and_target_lengths(inputs_length, noise_density, mean_noise_span_length):
+    """This function is copy of `random_spans_helper <https://github.com/google-research/text-to-text-transfer-transformer/blob/84f8bcc14b5f2c03de51bd3587609ba8f6bbd1cd/t5/data/preprocessors.py#L2466>`__ .
+
+    Training parameters to avoid padding with random_spans_noise_mask.
+    When training a model with random_spans_noise_mask, we would like to set the other
+    training hyperparmeters in a way that avoids padding.
+    This function helps us compute these hyperparameters.
+    We assume that each noise span in the input is replaced by extra_tokens_per_span_inputs sentinel tokens,
+    and each non-noise span in the targets is replaced by extra_tokens_per_span_targets sentinel tokens.
+    This function tells us the required number of tokens in the raw example (for split_tokens())
+    as well as the length of the encoded targets. Note that this function assumes
+    the inputs and targets will have EOS appended and includes that in the reported length.
+
+    Args:
+        inputs_length: an integer - desired length of the tokenized inputs sequence
+        noise_density: a float
+        mean_noise_span_length: a float
+    Returns:
+        tokens_length: length of original text in tokens
+        targets_length: an integer - length in tokens of encoded targets sequence
+    """
+
+    def _tokens_length_to_inputs_length_targets_length(tokens_length):
+        num_noise_tokens = int(round(tokens_length * noise_density))
+        num_nonnoise_tokens = tokens_length - num_noise_tokens
+        num_noise_spans = int(round(num_noise_tokens / mean_noise_span_length))
+        # inputs contain all nonnoise tokens, sentinels for all noise spans
+        # and one EOS token.
+        _input_length = num_nonnoise_tokens + num_noise_spans + 1
+        _output_length = num_noise_tokens + num_noise_spans + 1
+        return _input_length, _output_length
+
+    tokens_length = inputs_length
+
+    while _tokens_length_to_inputs_length_targets_length(tokens_length + 1)[0] <= inputs_length:
+        tokens_length += 1
+
+    inputs_length, targets_length = _tokens_length_to_inputs_length_targets_length(tokens_length)
+
+    # minor hack to get the targets length to be equal to inputs length
+    # which is more likely to have been set to a nice round number.
+    if noise_density == 0.5 and targets_length > inputs_length:
+        tokens_length -= 1
+        targets_length -= 1
+    return tokens_length, targets_length
+
+
+@flax.struct.dataclass
+class FlaxDataCollatorForT5MLM:
+    """
+    Data collator used for T5 span-masked language modeling.
+    It is made sure that after masking the inputs are of length `data_args.max_seq_length` and targets are also of fixed length.
+    For more information on how T5 span-masked language modeling works, one can take a look
+    at the `official paper <https://arxiv.org/pdf/1910.10683.pdf>`__
+    or the `official code for preprocessing <https://github.com/google-research/text-to-text-transfer-transformer/blob/master/t5/data/preprocessors.py>`__ .
+
+    Args:
+        tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
+            The tokenizer used for encoding the data.
+        noise_density (:obj:`float`):
+            The probability with which to (randomly) mask tokens in the input.
+        mean_noise_span_length (:obj:`float`):
+            The average span length of the masked tokens.
+        input_length (:obj:`int`):
+            The expected input length after masking.
+        target_length (:obj:`int`):
+            The expected target length after masking.
+        pad_token_id: (:obj:`int`):
+            The pad token id of the model
+        decoder_start_token_id: (:obj:`int):
+            The decoder start token id of the model
+    """
+
+    tokenizer: PreTrainedTokenizerBase
+    noise_density: float
+    mean_noise_span_length: float
+    input_length: int
+    target_length: int
+    pad_token_id: int
+    decoder_start_token_id: int
+
+    def __call__(self, examples: List[Dict[str, np.ndarray]]) -> Dict[str, np.ndarray]:
+
+        # convert list to dict and tensorize input
+        batch = BatchEncoding(
+            {k: np.array([examples[i][k] for i in range(len(examples))]) for k, v in examples[0].items()}
+        )
+
+        input_ids = batch["input_ids"]
+        batch_size, expandend_input_length = input_ids.shape
+
+        mask_indices = np.asarray([self.random_spans_noise_mask(expandend_input_length) for i in range(batch_size)])
+        labels_mask = ~mask_indices
+
+        input_ids_sentinel = self.create_sentinel_ids(mask_indices.astype(np.int8))
+        labels_sentinel = self.create_sentinel_ids(labels_mask.astype(np.int8))
+
+        batch["input_ids"] = self.filter_input_ids(input_ids, input_ids_sentinel)
+        batch["labels"] = self.filter_input_ids(input_ids, labels_sentinel)
+
+        if batch["input_ids"].shape[-1] != self.input_length:
+            raise ValueError(
+                f"`input_ids` are incorrectly preprocessed. `input_ids` length is {batch['input_ids'].shape[-1]}, but should be {self.target_length}."
+            )
+
+        if batch["labels"].shape[-1] != self.target_length:
+            raise ValueError(
+                f"`labels` are incorrectly preprocessed. `labels` length is {batch['labels'].shape[-1]}, but should be {self.target_length}."
+            )
+
+        # to check that tokens are correctly proprocessed, one can run `self.tokenizer.batch_decode(input_ids)` and `self.tokenizer.batch_decode(labels)` here...
+        batch["decoder_input_ids"] = shift_tokens_right(
+            batch["labels"], self.pad_token_id, self.decoder_start_token_id
+        )
+
+        return batch
+
+    def create_sentinel_ids(self, mask_indices):
+        """
+        Sentinel ids creation given the indices that should be masked.
+        The start indices of each mask are replaced by the sentinel ids in increasing
+        order. Consecutive mask indices to be deleted are replaced with `-1`.
+        """
+        start_indices = mask_indices - np.roll(mask_indices, 1, axis=-1) * mask_indices
+        start_indices[:, 0] = mask_indices[:, 0]
+
+        sentinel_ids = np.where(start_indices != 0, np.cumsum(start_indices, axis=-1), start_indices)
+        sentinel_ids = np.where(sentinel_ids != 0, (sentinel_ids + self.tokenizer.vocab_size - 1), 0)
+        sentinel_ids -= mask_indices - start_indices
+
+        return sentinel_ids
+
+    def filter_input_ids(self, input_ids, sentinel_ids):
+        """
+        Puts sentinel mask on `input_ids` and fuse consecutive mask tokens into a single mask token by deleting.
+        This will reduce the sequence length from `expanded_inputs_length` to `input_length`.
+        """
+        batch_size = input_ids.shape[0]
+
+        input_ids_full = np.where(sentinel_ids != 0, sentinel_ids, input_ids)
+        input_ids = input_ids_full[input_ids_full > 0].reshape((batch_size, -1))
+        input_ids = np.concatenate(
+            [input_ids, np.full((batch_size, 1), self.tokenizer.eos_token_id, dtype=np.int32)], axis=-1
+        )
+        return input_ids
+
+    def random_spans_noise_mask(self, length):
+
+        """This function is copy of `random_spans_helper <https://github.com/google-research/text-to-text-transfer-transformer/blob/84f8bcc14b5f2c03de51bd3587609ba8f6bbd1cd/t5/data/preprocessors.py#L2682>`__ .
+
+        Noise mask consisting of random spans of noise tokens.
+        The number of noise tokens and the number of noise spans and non-noise spans
+        are determined deterministically as follows:
+        num_noise_tokens = round(length * noise_density)
+        num_nonnoise_spans = num_noise_spans = round(num_noise_tokens / mean_noise_span_length)
+        Spans alternate between non-noise and noise, beginning with non-noise.
+        Subject to the above restrictions, all masks are equally likely.
+
+        Args:
+            length: an int32 scalar (length of the incoming token sequence)
+            noise_density: a float - approximate density of output mask
+            mean_noise_span_length: a number
+
+        Returns:
+            a boolean tensor with shape [length]
+        """
+
+        orig_length = length
+
+        num_noise_tokens = int(np.round(length * self.noise_density))
+        # avoid degeneracy by ensuring positive numbers of noise and nonnoise tokens.
+        num_noise_tokens = min(max(num_noise_tokens, 1), length - 1)
+        num_noise_spans = int(np.round(num_noise_tokens / self.mean_noise_span_length))
+
+        # avoid degeneracy by ensuring positive number of noise spans
+        num_noise_spans = max(num_noise_spans, 1)
+        num_nonnoise_tokens = length - num_noise_tokens
+
+        # pick the lengths of the noise spans and the non-noise spans
+        def _random_segmentation(num_items, num_segments):
+            """Partition a sequence of items randomly into non-empty segments.
+            Args:
+                num_items: an integer scalar > 0
+                num_segments: an integer scalar in [1, num_items]
+            Returns:
+                a Tensor with shape [num_segments] containing positive integers that add
+                up to num_items
+            """
+            mask_indices = np.arange(num_items - 1) < (num_segments - 1)
+            np.random.shuffle(mask_indices)
+            first_in_segment = np.pad(mask_indices, [[1, 0]])
+            segment_id = np.cumsum(first_in_segment)
+            segment_length = np.asarray(jax.ops.segment_sum(np.ones_like(segment_id), segment_id))
+            return segment_length
+
+        noise_span_lengths = _random_segmentation(num_noise_tokens, num_noise_spans)
+        nonnoise_span_lengths = _random_segmentation(num_nonnoise_tokens, num_noise_spans)
+
+        interleaved_span_lengths = np.reshape(
+            np.stack([nonnoise_span_lengths, noise_span_lengths], axis=1), [num_noise_spans * 2]
+        )
+        span_starts = np.cumsum(interleaved_span_lengths)[:-1]
+        span_start_indicator = np.zeros((length,), dtype=np.int8)
+        span_start_indicator[span_starts] = True
+        span_num = np.cumsum(span_start_indicator)
+        is_noise = np.equal(span_num % 2, 1)
+
+        return is_noise[:orig_length]
+
+
+def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
+    num_samples = len(samples_idx)
+    samples_to_remove = num_samples % batch_size
+
+    if samples_to_remove != 0:
+        samples_idx = samples_idx[:-samples_to_remove]
+    sections_split = num_samples // batch_size
+    batch_idx = np.split(samples_idx, sections_split)
+    return batch_idx
+
+
+def write_train_metric(summary_writer, train_metrics, train_time, step):
+    summary_writer.scalar("train_time", train_time, step)
+
+    train_metrics = get_metrics(train_metrics)
+    for key, vals in train_metrics.items():
+        tag = f"train_{key}"
+        for i, val in enumerate(vals):
+            summary_writer.scalar(tag, val, step - len(vals) + i + 1)
+
+
+def write_eval_metric(summary_writer, eval_metrics, step):
+    for metric_name, value in eval_metrics.items():
+        summary_writer.scalar(f"eval_{metric_name}", value, step)
+
+
+if __name__ == "__main__":
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty."
+            "Use --overwrite_output_dir to overcome."
+        )
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        level="NOTSET",
+        datefmt="[%X]",
+    )
+
+    # Log on each process the small summary:
+    logger = logging.getLogger(__name__)
+
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    logger.info(f"Training/evaluation parameters {training_args}")
+
+    # Set seed before initializing model.
+    set_seed(training_args.seed)
+
+    # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
+    # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
+    # (the dataset will be downloaded automatically from the datasets Hub).
+    #
+    # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
+    # 'text' is found. You can easily tweak this behavior (see below).
+    if data_args.dataset_name is not None:
+        # Downloading and loading a dataset from the hub.
+        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                data_args.dataset_name,
+                data_args.dataset_config_name,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    else:
+        data_files = {}
+        if data_args.train_file is not None:
+            data_files["train"] = data_args.train_file
+        if data_args.validation_file is not None:
+            data_files["validation"] = data_args.validation_file
+        extension = data_args.train_file.split(".")[-1]
+        if extension == "txt":
+            extension = "text"
+        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+
+        if "validation" not in datasets.keys():
+            datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
+    # https://huggingface.co/docs/datasets/loading_datasets.html.
+
+    # Load pretrained model and tokenizer
+
+    if model_args.tokenizer_name:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    elif model_args.model_name_or_path:
+        tokenizer = AutoTokenizer.from_pretrained(
+            model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
+        )
+    else:
+        raise ValueError(
+            "You are instantiating a new tokenizer from scratch. This is not supported by this script."
+            "You can do it from another script, save it, and load it from here, using --tokenizer_name."
+        )
+
+    if model_args.config_name:
+        config = T5Config.from_pretrained(
+            model_args.config_name, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer)
+        )
+    elif model_args.model_name_or_path:
+        config = T5Config.from_pretrained(
+            model_args.model_name_or_path, cache_dir=model_args.cache_dir, vocab_size=len(tokenizer)
+        )
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    # Preprocessing the datasets.
+    # First we tokenize all the texts.
+    if training_args.do_train:
+        column_names = datasets["train"].column_names
+    else:
+        column_names = datasets["validation"].column_names
+    text_column_name = "text" if "text" in column_names else column_names[0]
+
+    max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
+
+    # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
+    # Since we make sure that all sequences are of the same length, no attention_mask is needed.
+    def tokenize_function(examples):
+        return tokenizer(examples[text_column_name], return_attention_mask=False)
+
+    tokenized_datasets = datasets.map(
+        tokenize_function,
+        batched=True,
+        num_proc=data_args.preprocessing_num_workers,
+        remove_columns=column_names,
+        load_from_cache_file=not data_args.overwrite_cache,
+    )
+
+    # T5-like span masked language modeling will fuse consecutively masked tokens to a single sentinel token.
+    # To ensure that the input length is `max_seq_length`, we need to increase the maximum length
+    # according to `mlm_probability` and `mean_noise_span_length`. We can also define the label length accordingly.
+    expanded_inputs_length, targets_length = compute_input_and_target_lengths(
+        inputs_length=max_seq_length,
+        noise_density=data_args.mlm_probability,
+        mean_noise_span_length=data_args.mean_noise_span_length,
+    )
+
+    # Main data processing function that will concatenate all texts from our dataset and generate chunks of expanded_inputs_length.
+    def group_texts(examples):
+        # Concatenate all texts.
+        concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
+        total_length = len(concatenated_examples[list(examples.keys())[0]])
+        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
+        # customize this part to your needs.
+        if total_length >= expanded_inputs_length:
+            total_length = (total_length // expanded_inputs_length) * expanded_inputs_length
+        # Split by chunks of max_len.
+        result = {
+            k: [t[i : i + expanded_inputs_length] for i in range(0, total_length, expanded_inputs_length)]
+            for k, t in concatenated_examples.items()
+        }
+        return result
+
+    # Note that with `batched=True`, this map processes 1,000 texts together, so group_texts throws away a
+    # remainder for each of those groups of 1,000 texts. You can adjust that batch_size here but a higher value
+    # might be slower to preprocess.
+    #
+    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
+    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map
+    tokenized_datasets = tokenized_datasets.map(
+        group_texts,
+        batched=True,
+        num_proc=data_args.preprocessing_num_workers,
+        load_from_cache_file=not data_args.overwrite_cache,
+    )
+
+    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
+    if has_tensorboard and jax.process_index() == 0:
+        try:
+            from flax.metrics.tensorboard import SummaryWriter
+
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    else:
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )
+
+    # Initialize our training
+    rng = jax.random.PRNGKey(training_args.seed)
+    dropout_rngs = jax.random.split(rng, jax.local_device_count())
+
+    if model_args.model_name_or_path:
+        model = FlaxT5ForConditionalGeneration.from_pretrained(
+            model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    else:
+        model = FlaxT5ForConditionalGeneration(config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype))
+
+    # Data collator
+    # This one will take care of randomly masking the tokens.
+    data_collator = FlaxDataCollatorForT5MLM(
+        tokenizer=tokenizer,
+        noise_density=data_args.mlm_probability,
+        mean_noise_span_length=data_args.mean_noise_span_length,
+        input_length=max_seq_length,
+        target_length=targets_length,
+        pad_token_id=model.config.pad_token_id,
+        decoder_start_token_id=model.config.decoder_start_token_id,
+    )
+
+    # Store some constant
+    num_epochs = int(training_args.num_train_epochs)
+    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
+
+    num_train_steps = len(tokenized_datasets["train"]) // train_batch_size * num_epochs
+
+    # Create learning rate schedule
+    warmup_fn = optax.linear_schedule(
+        init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
+    )
+    decay_fn = optax.linear_schedule(
+        init_value=training_args.learning_rate,
+        end_value=0,
+        transition_steps=num_train_steps - training_args.warmup_steps,
+    )
+    linear_decay_lr_schedule_fn = optax.join_schedules(
+        schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
+    )
+
+    # We use Optax's "masking" functionality to not apply weight decay
+    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
+    # mask boolean with the same structure as the parameters.
+    # The mask is True for parameters that should be decayed.
+    def decay_mask_fn(params):
+        flat_params = traverse_util.flatten_dict(params)
+        flat_mask = {
+            path: (path[-1] != "bias" and path[-2:] not in [("layer_norm", "scale"), ("final_layer_norm", "scale")])
+            for path in flat_params
+        }
+        return traverse_util.unflatten_dict(flat_mask)
+
+    # create adam optimizer
+    if training_args.adafactor:
+        # We use the default parameters here to initialize adafactor,
+        # For more details about the parameters please check https://github.com/deepmind/optax/blob/ed02befef9bf81cbbf236be3d2b0e032e9ed4a40/optax/_src/alias.py#L74
+        optimizer = optax.adafactor(
+            learning_rate=linear_decay_lr_schedule_fn,
+        )
+    else:
+        optimizer = optax.adamw(
+            learning_rate=linear_decay_lr_schedule_fn,
+            b1=training_args.adam_beta1,
+            b2=training_args.adam_beta2,
+            weight_decay=training_args.weight_decay,
+            mask=decay_mask_fn,
+        )
+
+    # Setup train state
+    state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=optimizer)
+
+    # Define gradient update step fn
+    def train_step(state, batch, dropout_rng):
+        dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
+
+        def loss_fn(params):
+            labels = batch.pop("labels")
+
+            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+
+            # compute loss
+            loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])).mean()
+
+            return loss
+
+        grad_fn = jax.value_and_grad(loss_fn)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+        new_state = state.apply_gradients(grads=grad)
+
+        metrics = jax.lax.pmean(
+            {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
+        )
+
+        return new_state, metrics, new_dropout_rng
+
+    # Create parallel version of the train step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+
+    # Define eval fn
+    def eval_step(params, batch):
+        labels = batch.pop("labels")
+
+        logits = model(**batch, params=params, train=False)[0]
+
+        # compute loss
+        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1]))
+
+        # compute accuracy
+        accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels)
+
+        # summarize metrics
+        metrics = {"loss": loss.mean(), "accuracy": accuracy.mean()}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+
+        return metrics
+
+    p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))
+
+    # Replicate the train state on each device
+    state = jax_utils.replicate(state)
+
+    train_time = 0
+    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+        train_start = time.time()
+        train_metrics = []
+
+        # Create sampling rng
+        rng, input_rng = jax.random.split(rng)
+
+        # Generate an epoch by shuffling sampling indices from the train dataset
+        num_train_samples = len(tokenized_datasets["train"])
+        train_samples_idx = jax.random.permutation(input_rng, jnp.arange(num_train_samples))
+        train_batch_idx = generate_batch_splits(train_samples_idx, train_batch_size)
+
+        # Gather the indexes for creating the batch and do a training step
+        for step, batch_idx in enumerate(tqdm(train_batch_idx, desc="Training...", position=1)):
+            samples = [tokenized_datasets["train"][int(idx)] for idx in batch_idx]
+            model_inputs = data_collator(samples)
+
+            # Model forward
+            model_inputs = shard(model_inputs.data)
+            state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
+            train_metrics.append(train_metric)
+
+            cur_step = epoch * (num_train_samples // train_batch_size) + step
+
+            if cur_step % training_args.logging_steps == 0 and cur_step > 0:
+                # Save metrics
+                train_metric = jax_utils.unreplicate(train_metric)
+                train_time += time.time() - train_start
+                if has_tensorboard and jax.process_index() == 0:
+                    write_train_metric(summary_writer, train_metrics, train_time, cur_step)
+
+                epochs.write(
+                    f"Step... ({cur_step} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
+                )
+
+                train_metrics = []
+
+            if cur_step % training_args.eval_steps == 0 and cur_step > 0:
+                # ======================== Evaluating ==============================
+                num_eval_samples = len(tokenized_datasets["validation"])
+                eval_samples_idx = jnp.arange(num_eval_samples)
+                eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
+
+                eval_metrics = []
+                for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=2)):
+                    samples = [tokenized_datasets["validation"][int(idx)] for idx in batch_idx]
+                    model_inputs = data_collator(samples)
+
+                    # Model forward
+                    model_inputs = shard(model_inputs.data)
+                    metrics = p_eval_step(state.params, model_inputs)
+                    eval_metrics.append(metrics)
+
+                # get eval metrics
+                eval_metrics = get_metrics(eval_metrics)
+                eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
+
+                # Update progress bar
+                epochs.write(f"Step... ({cur_step} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})")
+
+                # Save metrics
+                if has_tensorboard and jax.process_index() == 0:
+                    write_eval_metric(summary_writer, eval_metrics, cur_step)
+
+            if cur_step % training_args.save_steps == 0 and cur_step > 0:
+                # save checkpoint after each epoch and push checkpoint to the hub
+                if jax.process_index() == 0:
+                    params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+                    model.save_pretrained(
+                        training_args.output_dir,
+                        params=params,
+                        push_to_hub=training_args.push_to_hub,
+                        commit_message=f"Saving weights and logs of step {cur_step}",
+                    )
--- a/examples/flax/language-modeling/t5_tokenizer_model.py
+++ b/examples/flax/language-modeling/t5_tokenizer_model.py
@@ -0,0 +1,112 @@
+#!/usr/bin/env python3
+import json
+from typing import Iterator, List, Union
+
+from tokenizers import AddedToken, Regex, Tokenizer, decoders, normalizers, pre_tokenizers, trainers
+from tokenizers.implementations.base_tokenizer import BaseTokenizer
+from tokenizers.models import Unigram
+from tokenizers.processors import TemplateProcessing
+
+
+class SentencePieceUnigramTokenizer(BaseTokenizer):
+    """
+    This class is a copy of `DeDLOC's tokenizer implementation <https://github.com/yandex-research/DeDLOC/blob/main/sahajbert/tokenizer/tokenizer_model.py>`__ .
+
+    Custom SentencePiece Unigram Tokenizer with NMT, NKFC, spaces and lower-casing characters normalization
+    Represents the Unigram algorithm, with the pretokenization used by SentencePiece
+    """
+
+    def __init__(
+        self,
+        replacement: str = "▁",
+        add_prefix_space: bool = True,
+        unk_token: Union[str, AddedToken] = "<unk>",
+        eos_token: Union[str, AddedToken] = "</s>",
+        pad_token: Union[str, AddedToken] = "<pad>",
+    ):
+        self.special_tokens = {
+            "pad": {"id": 0, "token": pad_token},
+            "eos": {"id": 1, "token": eos_token},
+            "unk": {"id": 2, "token": unk_token},
+        }
+
+        self.special_tokens_list = [None] * len(self.special_tokens)
+        for token_dict in self.special_tokens.values():
+            self.special_tokens_list[token_dict["id"]] = token_dict["token"]
+
+        tokenizer = Tokenizer(Unigram())
+
+        tokenizer.normalizer = normalizers.Sequence(
+            [
+                normalizers.Nmt(),
+                normalizers.NFKC(),
+                normalizers.Replace(Regex(" {2,}"), " "),
+                normalizers.Lowercase(),
+            ]
+        )
+        tokenizer.pre_tokenizer = pre_tokenizers.Sequence(
+            [
+                pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space),
+                pre_tokenizers.Digits(individual_digits=True),
+                pre_tokenizers.Punctuation(),
+            ]
+        )
+        tokenizer.decoder = decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
+
+        tokenizer.post_processor = TemplateProcessing(
+            single=f"$A {self.special_tokens['eos']['token']}",
+            special_tokens=[(self.special_tokens["eos"]["token"], self.special_tokens["eos"]["id"])],
+        )
+
+        parameters = {
+            "model": "SentencePieceUnigram",
+            "replacement": replacement,
+            "add_prefix_space": add_prefix_space,
+        }
+
+        super().__init__(tokenizer, parameters)
+
+    def train(
+        self,
+        files: Union[str, List[str]],
+        vocab_size: int = 8000,
+        show_progress: bool = True,
+    ):
+        """Train the model using the given files"""
+
+        trainer = trainers.UnigramTrainer(
+            vocab_size=vocab_size,
+            special_tokens=self.special_tokens_list,
+            show_progress=show_progress,
+        )
+
+        if isinstance(files, str):
+            files = [files]
+        self._tokenizer.train(files, trainer=trainer)
+
+        self.add_unk_id()
+
+    def train_from_iterator(
+        self,
+        iterator: Union[Iterator[str], Iterator[Iterator[str]]],
+        vocab_size: int = 8000,
+        show_progress: bool = True,
+    ):
+        """Train the model using the given iterator"""
+
+        trainer = trainers.UnigramTrainer(
+            vocab_size=vocab_size,
+            special_tokens=self.special_tokens_list,
+            show_progress=show_progress,
+        )
+
+        self._tokenizer.train_from_iterator(iterator, trainer=trainer)
+
+        self.add_unk_id()
+
+    def add_unk_id(self):
+        tokenizer_json = json.loads(self._tokenizer.to_str())
+
+        tokenizer_json["model"]["unk_id"] = self.special_tokens["unk"]["id"]
+
+        self._tokenizer = Tokenizer.from_str(json.dumps(tokenizer_json))
--- a/examples/flax/summarization/README.md
+++ b/examples/flax/summarization/README.md
@@ -0,0 +1,66 @@
+# Summarization (Seq2Seq model) training examples
+
+The following example showcases how to finetune a sequence-to-sequence model for summarization
+using the JAX/Flax backend.
+
+JAX/Flax allows you to trace pure functions and compile them into efficient, fused accelerator code on both GPU and TPU.
+Models written in JAX/Flax are **immutable** and updated in a purely functional
+way which enables simple and efficient model parallelism.
+
+`run_summarization_flax.py` is a lightweight example of how to download and preprocess a dataset from the 🤗 Datasets library or use your own files (jsonlines or csv), then fine-tune one of the architectures above on it.
+
+For custom datasets in `jsonlines` format please see: https://huggingface.co/docs/datasets/loading_datasets.html#json-files and you also will find examples of these below.
+
+Let's start by creating a model repository to save the trained model and logs.
+Here we call the model `"bart-base-xsum"`, but you can change the model name as you like.
+
+You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
+you are logged in) or via the command line:
+
+```
+huggingface-cli repo create bart-base-xsum
+```
+Next we clone the model repository to add the tokenizer and model files.
+```
+git clone https://huggingface.co/<your-username>/bart-base-xsum
+```
+To ensure that all tensorboard traces will be uploaded correctly, we need to 
+track them. You can run the following command inside your model repo to do so.
+
+```
+cd bart-base-xsum
+git lfs track "*tfevents*"
+```
+
+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
+Next, let's add a symbolic link to the `run_summarization_flax.py`.
+
+```bash
+export MODEL_DIR="./bart-base-xsum"
+ln -s ~/transformers/examples/flax/summarization/run_summarization_flax.py run_summarization_flax.py
+```
+
+### Train the model
+Next we can run the example script to train the model:
+
+```bash
+python run_summarization_flax.py \
+	--output_dir ${MODEL_DIR} \
+	--model_name_or_path facebook/bart-base \
+	--tokenizer_name facebook/bart-base \
+	--dataset_name="xsum" \
+	--do_train --do_eval --do_predict --predict_with_generate \
+	--num_train_epochs 6 \
+	--learning_rate 5e-5 --warmup_steps 0 \
+	--per_device_train_batch_size 64 \
+	--per_device_eval_batch_size 64 \
+	--overwrite_output_dir \
+	--max_source_length 512 --max_target_length 64 \
+	--push_to_hub
+```
+
+This should finish in 37min, with validation loss and ROUGE2 score of 1.7785 and 17.01 respectively after 6 epochs. training statistics can be accessed on [tfhub.de](https://tensorboard.dev/experiment/OcPfOIgXRMSJqYB4RdK2tA/#scalars).
+
+> Note that here we used default `generate` arguments, using arguments specific for `xsum` dataset should give better ROUGE scores.  
--- a/examples/flax/summarization/requirements.txt
+++ b/examples/flax/summarization/requirements.txt
@@ -0,0 +1,5 @@
+datasets >= 1.1.3
+jax>=0.2.8
+jaxlib>=0.1.59
+flax>=0.3.4
+optax>=0.0.8
--- a/examples/flax/summarization/run_summarization_flax.py
+++ b/examples/flax/summarization/run_summarization_flax.py
@@ -135,6 +135,10 @@ class DataTrainingArguments:
        default=None,
        metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
    )
+    test_file: Optional[str] = field(
+        default=None,
+        metadata={"help": "An optional input predict data file to do prediction on (a text file)."},
+    )
    max_source_length: Optional[int] = field(
        default=1024,
        metadata={
@@ -313,7 +317,7 @@ def main():

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
@@ -542,7 +546,7 @@ def main():
        try:
            from flax.metrics.tensorboard import SummaryWriter

-            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir).joinpath("logs").as_posix())
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
        except ImportError as ie:
            has_tensorboard = False
            logger.warning(
@@ -578,9 +582,15 @@ def main():
    # to bias and LayerNorm scale parameters. decay_mask_fn returns a
    # mask boolean with the same structure as the parameters.
    # The mask is True for parameters that should be decayed.
+    # Note that this mask is specifically adapted for FlaxBart.
+    # For FlaxT5, one should correct the layer norm parameter naming
+    # accordingly - see `run_t5_mlm_flax.py` e.g.
    def decay_mask_fn(params):
        flat_params = traverse_util.flatten_dict(params)
-        flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
+        layer_norm_params = [
+            (name, "scale") for name in ["self_attn_layer_norm", "layernorm_embedding", "final_layer_norm"]
+        ]
+        flat_mask = {path: (path[-1] != "bias" and path[-2:] not in layer_norm_params) for path in flat_params}
        return traverse_util.unflatten_dict(flat_mask)

    # create adam optimizer
@@ -787,10 +797,15 @@ def main():
        desc = f"Predict Loss: {pred_metrics['loss']} | {rouge_desc})"
        logger.info(desc)

-    # save last checkpoint
-    if jax.process_index() == 0:
-        params = jax.device_get(unreplicate(state.params))
-        model.save_pretrained(training_args.output_dir, params=params)
+        # save checkpoint after each epoch and push checkpoint to the hub
+        if jax.process_index() == 0:
+            params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+            model.save_pretrained(
+                training_args.output_dir,
+                params=params,
+                push_to_hub=training_args.push_to_hub,
+                commit_message=f"Saving weights and logs of epoch {epoch+1}",
+            )


 if __name__ == "__main__":
--- a/examples/flax/text-classification/README.md
+++ b/examples/flax/text-classification/README.md
@@ -23,31 +23,68 @@ Based on the script [`run_flax_glue.py`](https://github.com/huggingface/transfor
 Fine-tuning the library models for sequence classification on the GLUE benchmark: [General Language Understanding
 Evaluation](https://gluebenchmark.com/). This script can fine-tune any of the models on the [hub](https://huggingface.co/models).

-GLUE is made up of a total of 9 different tasks. Here is how to run the script on one of them:
+To begin with it is recommended to create a model repository to save the trained model and logs.
+Here we call the model `"bert-glue-mrpc-test"`, but you can change the model name as you like.
+
+You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
+you are logged in) or via the command line:
+
+```
+huggingface-cli repo create bert-glue-mrpc-test
+```
+
+Next we clone the model repository to add the tokenizer and model files.
+
+```
+git clone https://huggingface.co/<your-username>/bert-glue-mrpc-test
+```
+
+To ensure that all tensorboard traces will be uploaded correctly, we need to 
+track them. You can run the following command inside your model repo to do so.
+
+```
+cd bert-glue-mrpc-test
+git lfs track "*tfevents*"
+```
+
+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
+Next, let's add a symbolic link to the `run_flax_glue.py`.

 ```bash
 export TASK_NAME=mrpc
+export MODEL_DIR="./bert-glue-mrpc-test"
+ln -s ~/transformers/examples/flax/text-classification/run_flax_glue.py run_flax_glue.py
+```

+
+GLUE is made up of a total of 9 different tasks. Here is how to run the script on one of them:
+
+```bash
 python run_flax_glue.py \
  --model_name_or_path bert-base-cased \
-  --task_name $TASK_NAME \
+  --task_name ${TASK_NAME} \
  --max_length 128 \
  --learning_rate 2e-5 \
  --num_train_epochs 3 \
  --per_device_train_batch_size 4 \
-  --output_dir /tmp/$TASK_NAME/
+  --output_dir ${MODEL_DIR} \
+  --push_to_hub
 ```

 where task name can be one of cola, mnli, mnli-mm, mrpc, qnli, qqp, rte, sst2, stsb, wnli.

 Using the command above, the script will train for 3 epochs and run eval after each epoch. 
-Metrics and hyperparameters are stored in Tensorflow event files in `---output_dir`.
+Metrics and hyperparameters are stored in Tensorflow event files in `--output_dir`.
 You can see the results by running `tensorboard` in that directory:

 ```bash
 $ tensorboard --logdir .
 ```

+or directly on the hub under *Training metrics*.
+
 ### Accuracy Evaluation

 We train five replicas and report mean accuracy and stdev on the dev set below.
@@ -95,14 +132,8 @@ overall training time below. For comparison we ran Pytorch's [run_glue.py](https
 | WNLI  |  1m 11s   |     48s    | 39s        | 36s             |
 |-------|
 | **TOTAL** | 1h 03m | 1h 28m | 5h 16m | 6h 37m      |
-| **COST*** | $8.56  | $29.10 | $13.06 | $16.41      |

-
-*All experiments are ran on Google Cloud Platform. Prices are on-demand prices
-(not preemptible), obtained on May 12, 2021 for zone Iowa (us-central1) using
-the following tables:
-[TPU pricing table](https://cloud.google.com/tpu/pricing) ($8.00/h for v3-8),
-[GPU pricing table](https://cloud.google.com/compute/gpus-pricing) ($2.48/h per
-V100 GPU). GPU experiments are ran without further optimizations besides JAX
+*All experiments are ran on Google Cloud Platform. 
+GPU experiments are ran without further optimizations besides JAX
 transformations. GPU experiments are ran with full precision (fp32). "TPU v3-8"
 are 8 TPU cores on 4 chips (each chips has 2 cores), while "8 GPU" are 8 GPU chips.
--- a/examples/flax/text-classification/run_flax_glue.py
+++ b/examples/flax/text-classification/run_flax_glue.py
@@ -123,6 +123,11 @@ def parse_args():
    )
    parser.add_argument("--output_dir", type=str, default=None, help="Where to store the final model.")
    parser.add_argument("--seed", type=int, default=3, help="A seed for reproducible training.")
+    parser.add_argument(
+        "--push_to_hub",
+        action="store_true",
+        help="If passed, model checkpoints and tensorboard logs will be pushed to the hub",
+    )
    args = parser.parse_args()

    # Sanity checks
@@ -249,7 +254,7 @@ def main():

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
@@ -491,10 +496,15 @@ def main():
        cur_step = epoch * (len(train_dataset) // train_batch_size)
        write_metric(train_metrics, eval_metric, train_time, cur_step)

-    # save last checkpoint
-    if jax.process_index() == 0:
-        params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
-        model.save_pretrained(args.output_dir, params=params)
+        # save checkpoint after each epoch and push checkpoint to the hub
+        if jax.process_index() == 0:
+            params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+            model.save_pretrained(
+                args.output_dir,
+                params=params,
+                push_to_hub=args.push_to_hub,
+                commit_message=f"Saving weights and logs of epoch {epoch}",
+            )


 if __name__ == "__main__":
--- a/examples/flax/vision/README.md
+++ b/examples/flax/vision/README.md
@@ -0,0 +1,101 @@
+<!---
+Copyright 2021 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# Image Classification training examples
+
+The following example showcases how to train/fine-tune `ViT` for image-classification using the JAX/Flax backend.
+
+JAX/Flax allows you to trace pure functions and compile them into efficient, fused accelerator code on both GPU and TPU.
+Models written in JAX/Flax are **immutable** and updated in a purely functional
+way which enables simple and efficient model parallelism.
+
+
+In this example we will train/fine-tune the model on the [imagenette](https://github.com/fastai/imagenette) dataset.
+
+Let's start by creating a model repository to save the trained model and logs.
+Here we call the model `"vit-base-patch16-imagenette"`, but you can change the model name as you like.
+
+You can do this either directly on [huggingface.co](https://huggingface.co/new) (assuming that
+you are logged in) or via the command line:
+
+```
+huggingface-cli repo create vit-base-patch16-imagenette
+```
+Next we clone the model repository to add the tokenizer and model files.
+```
+git clone https://huggingface.co/<your-username>/vit-base-patch16-imagenette
+```
+To ensure that all tensorboard traces will be uploaded correctly, we need to 
+track them. You can run the following command inside your model repo to do so.
+
+```
+cd vit-base-patch16-imagenette
+git lfs track "*tfevents*"
+```
+
+Great, we have set up our model repository. During training, we will automatically
+push the training logs and model weights to the repo.
+
+Next, let's add a symbolic link to the `run_image_classification_flax.py`.
+
+```bash
+export MODEL_DIR="./vit-base-patch16-imagenette
+ln -s ~/transformers/examples/flax/summarization/run_image_classification_flax.py run_image_classification_flax.py
+```
+
+## Prepare the dataset
+
+We will use the [imagenette](https://github.com/fastai/imagenette) dataset to train/fine-tune our model. Imagenette is a subset of 10 easily classified classes from Imagenet (tench, English springer, cassette player, chain saw, church, French horn, garbage truck, gas pump, golf ball, parachute).
+
+
+### Download and extract the data.
+
+```bash
+wget https://s3.amazonaws.com/fast-ai-imageclas/imagenette2.tgz
+tar -xvzf imagenette2.tgz
+```
+
+This will create a `imagenette2` dir with two subdirectories `train` and `val` each with multiple subdirectories per class. The training script expects the following directory structure
+
+```bash
+root/dog/xxx.png
+root/dog/xxy.png
+root/dog/[...]/xxz.png
+
+root/cat/123.png
+root/cat/nsdf3.png
+root/cat/[...]/asd932_.png
+```
+
+## Train the model
+
+Next we can run the example script to fine-tune the model:
+
+```bash
+python run_image_classification.py \
+    --output_dir ${MODEL_DIR} \
+    --model_name_or_path google/vit-base-patch16-224-in21k \
+    --train_dir="imagenette2/train" \
+    --validation_dir="imagenette2/val" \
+    --num_train_epochs 5 \
+    --learning_rate 1e-3 \
+    --per_device_train_batch_size 128 --per_device_eval_batch_size 128 \
+    --overwrite_output_dir \
+    --preprocessing_num_workers 32 \
+    --push_to_hub
+```
+
+This should finish in ~7mins with 99% validation accuracy.
--- a/examples/flax/vision/requirements.txt
+++ b/examples/flax/vision/requirements.txt
@@ -0,0 +1,8 @@
+jax>=0.2.8
+jaxlib>=0.1.59
+flax>=0.3.4
+optax>=0.0.8
+-f https://download.pytorch.org/whl/torch_stable.html
+torch==1.9.0+cpu 
+-f https://download.pytorch.org/whl/torch_stable.html
+torchvision==0.10.0+cpu
--- a/examples/flax/vision/run_image_classification.py
+++ b/examples/flax/vision/run_image_classification.py
@@ -0,0 +1,467 @@
+#!/usr/bin/env python
+# coding=utf-8
+# Copyright 2021 The HuggingFace Team All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Pre-training/Fine-tuning ViT for image classification .
+
+Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
+https://huggingface.co/models?filter=vit
+"""
+
+import logging
+import os
+import sys
+import time
+from dataclasses import dataclass, field
+from pathlib import Path
+from typing import Callable, Optional
+
+# for dataset and preprocessing
+import torch
+import torchvision
+import torchvision.transforms as transforms
+from tqdm import tqdm
+
+import jax
+import jax.numpy as jnp
+import optax
+import transformers
+from flax import jax_utils
+from flax.jax_utils import unreplicate
+from flax.training import train_state
+from flax.training.common_utils import get_metrics, onehot, shard, shard_prng_key
+from transformers import (
+    CONFIG_MAPPING,
+    FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
+    AutoConfig,
+    FlaxAutoModelForImageClassification,
+    HfArgumentParser,
+    TrainingArguments,
+    is_tensorboard_available,
+    set_seed,
+)
+
+
+logger = logging.getLogger(__name__)
+
+
+MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.keys())
+MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
+    """
+
+    model_name_or_path: Optional[str] = field(
+        default=None,
+        metadata={
+            "help": "The model checkpoint for weights initialization."
+            "Don't set if you want to train a model from scratch."
+        },
+    )
+    model_type: Optional[str] = field(
+        default=None,
+        metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+    dtype: Optional[str] = field(
+        default="float32",
+        metadata={
+            "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
+        },
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Arguments pertaining to what data we are going to input our model for training and eval.
+    """
+
+    train_dir: str = field(
+        metadata={"help": "Path to the root training directory which contains one subdirectory per class."}
+    )
+    validation_dir: str = field(
+        metadata={"help": "Path to the root validation directory which contains one subdirectory per class."},
+    )
+    image_size: Optional[int] = field(default=224, metadata={"help": " The size (resolution) of each image."})
+    max_train_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of training examples to this "
+            "value if set."
+        },
+    )
+    max_eval_samples: Optional[int] = field(
+        default=None,
+        metadata={
+            "help": "For debugging purposes or quicker training, truncate the number of evaluation examples to this "
+            "value if set."
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+    preprocessing_num_workers: Optional[int] = field(
+        default=None,
+        metadata={"help": "The number of processes to use for the preprocessing."},
+    )
+
+
+class TrainState(train_state.TrainState):
+    dropout_rng: jnp.ndarray
+
+    def replicate(self):
+        return jax_utils.replicate(self).replace(dropout_rng=shard_prng_key(self.dropout_rng))
+
+
+def write_metric(summary_writer, train_metrics, eval_metrics, train_time, step):
+    summary_writer.scalar("train_time", train_time, step)
+
+    train_metrics = get_metrics(train_metrics)
+    for key, vals in train_metrics.items():
+        tag = f"train_{key}"
+        for i, val in enumerate(vals):
+            summary_writer.scalar(tag, val, step - len(vals) + i + 1)
+
+    for metric_name, value in eval_metrics.items():
+        summary_writer.scalar(f"eval_{metric_name}", value, step)
+
+
+def create_learning_rate_fn(
+    train_ds_size: int, train_batch_size: int, num_train_epochs: int, num_warmup_steps: int, learning_rate: float
+) -> Callable[[int], jnp.array]:
+    """Returns a linear warmup, linear_decay learning rate function."""
+    steps_per_epoch = train_ds_size // train_batch_size
+    num_train_steps = steps_per_epoch * num_train_epochs
+    warmup_fn = optax.linear_schedule(init_value=0.0, end_value=learning_rate, transition_steps=num_warmup_steps)
+    decay_fn = optax.linear_schedule(
+        init_value=learning_rate, end_value=0, transition_steps=num_train_steps - num_warmup_steps
+    )
+    schedule_fn = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[num_warmup_steps])
+    return schedule_fn
+
+
+def main():
+    # See all possible arguments in src/transformers/training_args.py
+    # or by passing the --help flag to this script.
+    # We now keep distinct sets of args, for a cleaner separation of concerns.
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
+        # If we pass only one argument to the script and it's the path to a json file,
+        # let's parse it to get our arguments.
+        model_args, data_args, training_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
+    else:
+        model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty."
+            "Use --overwrite_output_dir to overcome."
+        )
+
+    # Make one log on every process with the configuration for debugging.
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    # Setup logging, we only want one process per machine to log things on the screen.
+    logger.setLevel(logging.INFO if jax.process_index() == 0 else logging.ERROR)
+    if jax.process_index() == 0:
+        transformers.utils.logging.set_verbosity_info()
+    else:
+        transformers.utils.logging.set_verbosity_error()
+
+    # Set the verbosity to info of the Transformers logger (on main process only):
+    logger.info(f"Training/evaluation parameters {training_args}")
+
+    # set seed for random transforms and torch dataloaders
+    set_seed(training_args.seed)
+
+    # Initialize datasets and pre-processing transforms
+    # We use torchvision here for faster pre-processing
+    # Note that here we are using some default pre-processing, for maximum accuray
+    # one should tune this part and carefully select what transformations to use.
+    normalize = transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5])
+    train_dataset = torchvision.datasets.ImageFolder(
+        data_args.train_dir,
+        transforms.Compose(
+            [
+                transforms.RandomResizedCrop(data_args.image_size),
+                transforms.RandomHorizontalFlip(),
+                transforms.ToTensor(),
+                normalize,
+            ]
+        ),
+    )
+
+    eval_dataset = torchvision.datasets.ImageFolder(
+        data_args.validation_dir,
+        transforms.Compose(
+            [
+                transforms.Resize(data_args.image_size),
+                transforms.CenterCrop(data_args.image_size),
+                transforms.ToTensor(),
+                normalize,
+            ]
+        ),
+    )
+
+    # Load pretrained model and tokenizer
+    if model_args.config_name:
+        config = AutoConfig.from_pretrained(
+            model_args.config_name,
+            num_labels=len(train_dataset.classes),
+            image_size=data_args.image_size,
+            cache_dir=model_args.cache_dir,
+        )
+    elif model_args.model_name_or_path:
+        config = AutoConfig.from_pretrained(
+            model_args.model_name_or_path,
+            num_labels=len(train_dataset.classes),
+            image_size=data_args.image_size,
+            cache_dir=model_args.cache_dir,
+        )
+    else:
+        config = CONFIG_MAPPING[model_args.model_type]()
+        logger.warning("You are instantiating a new config instance from scratch.")
+
+    if model_args.model_name_or_path:
+        model = FlaxAutoModelForImageClassification.from_pretrained(
+            model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+    else:
+        model = FlaxAutoModelForImageClassification.from_config(
+            config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
+        )
+
+    # Store some constant
+    num_epochs = int(training_args.num_train_epochs)
+    train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
+    eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
+    steps_per_epoch = len(train_dataset) // train_batch_size
+    total_train_steps = steps_per_epoch * num_epochs
+
+    def collate_fn(examples):
+        pixel_values = torch.stack([example[0] for example in examples])
+        labels = torch.tensor([example[1] for example in examples])
+
+        batch = {"pixel_values": pixel_values, "labels": labels}
+        batch = {k: v.numpy() for k, v in batch.items()}
+
+        return batch
+
+    # Create data loaders
+    train_loader = torch.utils.data.DataLoader(
+        train_dataset,
+        batch_size=train_batch_size,
+        shuffle=True,
+        num_workers=data_args.preprocessing_num_workers,
+        persistent_workers=True,
+        drop_last=True,
+        collate_fn=collate_fn,
+    )
+
+    eval_loader = torch.utils.data.DataLoader(
+        eval_dataset,
+        batch_size=eval_batch_size,
+        shuffle=False,
+        num_workers=data_args.preprocessing_num_workers,
+        persistent_workers=True,
+        drop_last=True,
+        collate_fn=collate_fn,
+    )
+
+    # Enable tensorboard only on the master node
+    has_tensorboard = is_tensorboard_available()
+    if has_tensorboard and jax.process_index() == 0:
+        try:
+            from flax.metrics.tensorboard import SummaryWriter
+
+            summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
+        except ImportError as ie:
+            has_tensorboard = False
+            logger.warning(
+                f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
+            )
+    else:
+        logger.warning(
+            "Unable to display metrics through TensorBoard because the package is not installed: "
+            "Please run pip install tensorboard to enable."
+        )
+
+    # Initialize our training
+    rng = jax.random.PRNGKey(training_args.seed)
+    rng, dropout_rng = jax.random.split(rng)
+
+    # Create learning rate schedule
+    linear_decay_lr_schedule_fn = create_learning_rate_fn(
+        len(train_dataset),
+        train_batch_size,
+        training_args.num_train_epochs,
+        training_args.warmup_steps,
+        training_args.learning_rate,
+    )
+
+    # create adam optimizer
+    adamw = optax.adamw(
+        learning_rate=linear_decay_lr_schedule_fn,
+        b1=training_args.adam_beta1,
+        b2=training_args.adam_beta2,
+        eps=training_args.adam_epsilon,
+        weight_decay=training_args.weight_decay,
+    )
+
+    # Setup train state
+    state = TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw, dropout_rng=dropout_rng)
+
+    def loss_fn(logits, labels):
+        loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1]))
+        return loss.mean()
+
+    # Define gradient update step fn
+    def train_step(state, batch):
+        dropout_rng, new_dropout_rng = jax.random.split(state.dropout_rng)
+
+        def compute_loss(params):
+            labels = batch.pop("labels")
+            logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
+            loss = loss_fn(logits, labels)
+            return loss
+
+        grad_fn = jax.value_and_grad(compute_loss)
+        loss, grad = grad_fn(state.params)
+        grad = jax.lax.pmean(grad, "batch")
+
+        new_state = state.apply_gradients(grads=grad, dropout_rng=new_dropout_rng)
+
+        metrics = {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+
+        return new_state, metrics
+
+    # Define eval fn
+    def eval_step(params, batch):
+        labels = batch.pop("labels")
+        logits = model(**batch, params=params, train=False)[0]
+        loss = loss_fn(logits, labels)
+
+        # summarize metrics
+        accuracy = (jnp.argmax(logits, axis=-1) == labels).mean()
+        metrics = {"loss": loss, "accuracy": accuracy}
+        metrics = jax.lax.pmean(metrics, axis_name="batch")
+        return metrics
+
+    # Create parallel version of the train and eval step
+    p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
+    p_eval_step = jax.pmap(eval_step, "batch")
+
+    # Replicate the train state on each device
+    state = state.replicate()
+
+    logger.info("***** Running training *****")
+    logger.info(f"  Num examples = {len(train_dataset)}")
+    logger.info(f"  Num Epochs = {num_epochs}")
+    logger.info(f"  Instantaneous batch size per device = {training_args.per_device_train_batch_size}")
+    logger.info(f"  Total train batch size (w. parallel & distributed) = {train_batch_size}")
+    logger.info(f"  Total optimization steps = {total_train_steps}")
+
+    train_time = 0
+    epochs = tqdm(range(num_epochs), desc=f"Epoch ... (1/{num_epochs})", position=0)
+    for epoch in epochs:
+        # ======================== Training ================================
+        train_start = time.time()
+
+        # Create sampling rng
+        rng, input_rng = jax.random.split(rng)
+        train_metrics = []
+
+        steps_per_epoch = len(train_dataset) // train_batch_size
+        train_step_progress_bar = tqdm(total=steps_per_epoch, desc="Training...", position=1, leave=False)
+        # train
+        for batch in train_loader:
+            batch = shard(batch)
+            state, train_metric = p_train_step(state, batch)
+            train_metrics.append(train_metric)
+
+            train_step_progress_bar.update(1)
+
+        train_time += time.time() - train_start
+
+        train_metric = unreplicate(train_metric)
+
+        train_step_progress_bar.close()
+        epochs.write(
+            f"Epoch... ({epoch + 1}/{num_epochs} | Loss: {train_metric['loss']}, Learning Rate: {train_metric['learning_rate']})"
+        )
+
+        # ======================== Evaluating ==============================
+        eval_metrics = []
+        eval_steps = len(eval_dataset) // eval_batch_size
+        eval_step_progress_bar = tqdm(total=eval_steps, desc="Evaluating...", position=2, leave=False)
+        for batch in eval_loader:
+            # Model forward
+            batch = shard(batch)
+            metrics = p_eval_step(state.params, batch)
+            eval_metrics.append(metrics)
+
+            eval_step_progress_bar.update(1)
+
+        # normalize eval metrics
+        eval_metrics = get_metrics(eval_metrics)
+        eval_metrics = jax.tree_map(jnp.mean, eval_metrics)
+
+        # Print metrics and update progress bar
+        eval_step_progress_bar.close()
+        desc = (
+            f"Epoch... ({epoch + 1}/{num_epochs} | Eval Loss: {round(eval_metrics['loss'].item(), 4)} | "
+            f"Eval Accuracy: {round(eval_metrics['accuracy'].item(), 4)})"
+        )
+        epochs.write(desc)
+        epochs.desc = desc
+
+        # Save metrics
+        if has_tensorboard and jax.process_index() == 0:
+            cur_step = epoch * (len(train_dataset) // train_batch_size)
+            write_metric(summary_writer, train_metrics, eval_metrics, train_time, cur_step)
+
+        # save checkpoint after each epoch and push checkpoint to the hub
+        if jax.process_index() == 0:
+            params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
+            model.save_pretrained(
+                training_args.output_dir,
+                params=params,
+                push_to_hub=training_args.push_to_hub,
+                commit_message=f"Saving weights and logs of epoch {epoch+1}",
+            )
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/legacy/multiple_choice/run_multiple_choice.py
+++ b/examples/legacy/multiple_choice/run_multiple_choice.py
@@ -107,7 +107,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
--- a/examples/legacy/question-answering/run_squad.py
+++ b/examples/legacy/question-answering/run_squad.py
@@ -702,7 +702,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
--- a/examples/legacy/question-answering/run_squad_trainer.py
+++ b/examples/legacy/question-answering/run_squad_trainer.py
@@ -89,7 +89,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
--- a/examples/legacy/run_language_modeling.py
+++ b/examples/legacy/run_language_modeling.py
@@ -211,7 +211,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
--- a/examples/legacy/run_openai_gpt.py
+++ b/examples/legacy/run_openai_gpt.py
@@ -50,7 +50,7 @@ from transformers import (


 logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
 )
 logger = logging.getLogger(__name__)

--- a/examples/legacy/run_swag.py
+++ b/examples/legacy/run_swag.py
@@ -617,7 +617,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
--- a/examples/legacy/run_transfo_xl.py
+++ b/examples/legacy/run_transfo_xl.py
@@ -33,7 +33,7 @@ from transformers import TransfoXLCorpus, TransfoXLLMHeadModel


 logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
 )
 logger = logging.getLogger(__name__)

--- a/examples/legacy/seq2seq/finetune_trainer.py
+++ b/examples/legacy/seq2seq/finetune_trainer.py
@@ -163,7 +163,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
--- a/examples/legacy/text-classification/run_tf_text_classification.py
+++ b/examples/legacy/text-classification/run_tf_text_classification.py
@@ -220,7 +220,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
--- a/examples/legacy/token-classification/run_ner.py
+++ b/examples/legacy/token-classification/run_ner.py
@@ -131,7 +131,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
--- a/examples/legacy/token-classification/run_tf_ner.py
+++ b/examples/legacy/token-classification/run_tf_ner.py
@@ -127,7 +127,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
--- a/examples/pytorch/language-modeling/run_clm.py
+++ b/examples/pytorch/language-modeling/run_clm.py
@@ -28,6 +28,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset

 import transformers
@@ -50,7 +51,8 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")
+
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

 logger = logging.getLogger(__name__)
@@ -198,22 +200,23 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
@@ -245,15 +248,17 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
-        if "validation" not in datasets.keys():
-            datasets["validation"] = load_dataset(
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
-            datasets["train"] = load_dataset(
+            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
@@ -272,7 +277,22 @@ def main():
        )
        if extension == "txt":
            extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            raw_datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -333,9 +353,9 @@ def main():
    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
    else:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    # since this will be pickled to avoid _LazyModule error in Hasher force logger loading before tokenize_function
@@ -351,14 +371,15 @@ def main():
            )
        return output

-    tokenized_datasets = datasets.map(
-        tokenize_function,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-        remove_columns=column_names,
-        load_from_cache_file=not data_args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with training_args.main_process_first(desc="dataset map tokenization"):
+        tokenized_datasets = raw_datasets.map(
+            tokenize_function,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            remove_columns=column_names,
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )

    if data_args.block_size is None:
        block_size = tokenizer.model_max_length
@@ -383,7 +404,8 @@ def main():
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
-        total_length = (total_length // block_size) * block_size
+        if total_length >= block_size:
+            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
@@ -399,13 +421,14 @@ def main():
    # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
    # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

-    lm_datasets = tokenized_datasets.map(
-        group_texts,
-        batched=True,
-        num_proc=data_args.preprocessing_num_workers,
-        load_from_cache_file=not data_args.overwrite_cache,
-        desc=f"Grouping texts in chunks of {block_size}",
-    )
+    with training_args.main_process_first(desc="grouping texts together"):
+        lm_datasets = tokenized_datasets.map(
+            group_texts,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc=f"Grouping texts in chunks of {block_size}",
+        )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
--- a/examples/pytorch/language-modeling/run_clm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_clm_no_trainer.py
@@ -35,7 +35,7 @@ from torch.utils.data.dataloader import DataLoader
 from tqdm.auto import tqdm

 import transformers
-from accelerate import Accelerator
+from accelerate import Accelerator, DistributedType
 from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
@@ -200,7 +200,7 @@ def main():
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
@@ -253,6 +253,19 @@ def main():
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(extension, data_files=data_files)
+        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{args.validation_split_percentage}%]",
+            )
+            raw_datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{args.validation_split_percentage}%:]",
+            )
+
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -330,7 +343,8 @@ def main():
        total_length = len(concatenated_examples[list(examples.keys())[0]])
        # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
        # customize this part to your needs.
-        total_length = (total_length // block_size) * block_size
+        if total_length >= block_size:
+            total_length = (total_length // block_size) * block_size
        # Split by chunks of max_len.
        result = {
            k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
@@ -389,6 +403,10 @@ def main():
        model, optimizer, train_dataloader, eval_dataloader
    )

+    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
+    if accelerator.distributed_type == DistributedType.TPU:
+        model.tie_weights()
+
    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
    # shorter in multiprocess)

--- a/examples/pytorch/language-modeling/run_mlm.py
+++ b/examples/pytorch/language-modeling/run_mlm.py
@@ -28,6 +28,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset

 import transformers
@@ -49,7 +50,8 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")
+
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

 logger = logging.getLogger(__name__)
@@ -207,11 +209,17 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
@@ -219,10 +227,6 @@ def main():
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
@@ -254,15 +258,17 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
-        if "validation" not in datasets.keys():
-            datasets["validation"] = load_dataset(
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
-            datasets["train"] = load_dataset(
+            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
@@ -272,12 +278,29 @@ def main():
        data_files = {}
        if data_args.train_file is not None:
            data_files["train"] = data_args.train_file
+            extension = data_args.train_file.split(".")[-1]
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
-        extension = data_args.train_file.split(".")[-1]
+            extension = data_args.validation_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+
+        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            raw_datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -336,9 +359,9 @@ def main():
    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
    else:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length is None:
@@ -376,14 +399,15 @@ def main():
                return_special_tokens_mask=True,
            )

-        tokenized_datasets = datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=[text_column_name],
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on dataset line_by_line",
-        )
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=[text_column_name],
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on dataset line_by_line",
+            )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
@@ -391,14 +415,15 @@ def main():
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name], return_special_tokens_mask=True)

-        tokenized_datasets = datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on every text in dataset",
-        )
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on every text in dataset",
+            )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
@@ -408,7 +433,8 @@ def main():
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
-            total_length = (total_length // max_seq_length) * max_seq_length
+            if total_length >= max_seq_length:
+                total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
@@ -423,13 +449,14 @@ def main():
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

-        tokenized_datasets = tokenized_datasets.map(
-            group_texts,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc=f"Grouping texts in chunks of {max_seq_length}",
-        )
+        with training_args.main_process_first(desc="grouping texts together"):
+            tokenized_datasets = tokenized_datasets.map(
+                group_texts,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc=f"Grouping texts in chunks of {max_seq_length}",
+            )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
--- a/examples/pytorch/language-modeling/run_mlm_no_trainer.py
+++ b/examples/pytorch/language-modeling/run_mlm_no_trainer.py
@@ -35,7 +35,7 @@ from torch.utils.data.dataloader import DataLoader
 from tqdm.auto import tqdm

 import transformers
-from accelerate import Accelerator
+from accelerate import Accelerator, DistributedType
 from transformers import (
    CONFIG_MAPPING,
    MODEL_MAPPING,
@@ -212,7 +212,7 @@ def main():
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
@@ -265,6 +265,19 @@ def main():
        if extension == "txt":
            extension = "text"
        raw_datasets = load_dataset(extension, data_files=data_files)
+        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{args.validation_split_percentage}%]",
+            )
+            raw_datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{args.validation_split_percentage}%:]",
+            )
+
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -374,7 +387,8 @@ def main():
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
-            total_length = (total_length // max_seq_length) * max_seq_length
+            if total_length >= max_seq_length:
+                total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
@@ -434,6 +448,10 @@ def main():
        model, optimizer, train_dataloader, eval_dataloader
    )

+    # On TPU, the tie weights in our model have been disconnected, so we need to restore the ties.
+    if accelerator.distributed_type == DistributedType.TPU:
+        model.tie_weights()
+
    # Note -> the training dataloader needs to be prepared before we grab his length below (cause its length will be
    # shorter in multiprocess)

--- a/examples/pytorch/language-modeling/run_plm.py
+++ b/examples/pytorch/language-modeling/run_plm.py
@@ -25,6 +25,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset

 import transformers
@@ -45,7 +46,8 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")
+
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt")

 logger = logging.getLogger(__name__)
@@ -204,22 +206,23 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
@@ -251,15 +254,17 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
-        if "validation" not in datasets.keys():
-            datasets["validation"] = load_dataset(
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[:{data_args.validation_split_percentage}%]",
                cache_dir=model_args.cache_dir,
            )
-            datasets["train"] = load_dataset(
+            raw_datasets["train"] = load_dataset(
                data_args.dataset_name,
                data_args.dataset_config_name,
                split=f"train[{data_args.validation_split_percentage}%:]",
@@ -274,7 +279,22 @@ def main():
        extension = data_args.train_file.split(".")[-1]
        if extension == "txt":
            extension = "text"
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        # If no validation data is there, validation_split_percentage will be used to divide the dataset.
+        if "validation" not in raw_datasets.keys():
+            raw_datasets["validation"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[:{data_args.validation_split_percentage}%]",
+                cache_dir=model_args.cache_dir,
+            )
+            raw_datasets["train"] = load_dataset(
+                extension,
+                data_files=data_files,
+                split=f"train[{data_args.validation_split_percentage}%:]",
+                cache_dir=model_args.cache_dir,
+            )
+
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -333,9 +353,9 @@ def main():
    # Preprocessing the datasets.
    # First we tokenize all the texts.
    if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
    else:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
    text_column_name = "text" if "text" in column_names else column_names[0]

    if data_args.max_seq_length > tokenizer.model_max_length:
@@ -354,27 +374,29 @@ def main():
            examples["text"] = [line for line in examples["text"] if len(line) > 0 and not line.isspace()]
            return tokenizer(examples["text"], padding=padding, truncation=True, max_length=max_seq_length)

-        tokenized_datasets = datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=[text_column_name],
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on dataset line_by_line",
-        )
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=[text_column_name],
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on dataset line_by_line",
+            )
    else:
        # Otherwise, we tokenize every text, then concatenate them together before splitting them in smaller parts.
        def tokenize_function(examples):
            return tokenizer(examples[text_column_name])

-        tokenized_datasets = datasets.map(
-            tokenize_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on every text in dataset",
-        )
+        with training_args.main_process_first(desc="dataset map tokenization"):
+            tokenized_datasets = raw_datasets.map(
+                tokenize_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on every text in dataset",
+            )

        # Main data processing function that will concatenate all texts from our dataset and generate chunks of
        # max_seq_length.
@@ -384,7 +406,8 @@ def main():
            total_length = len(concatenated_examples[list(examples.keys())[0]])
            # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
            # customize this part to your needs.
-            total_length = (total_length // max_seq_length) * max_seq_length
+            if total_length >= max_seq_length:
+                total_length = (total_length // max_seq_length) * max_seq_length
            # Split by chunks of max_len.
            result = {
                k: [t[i : i + max_seq_length] for i in range(0, total_length, max_seq_length)]
@@ -399,13 +422,14 @@ def main():
        # To speed up this part, we use multiprocessing. See the documentation of the map method for more information:
        # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.map

-        tokenized_datasets = tokenized_datasets.map(
-            group_texts,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc=f"Grouping texts in chunks of {max_seq_length}",
-        )
+        with training_args.main_process_first(desc="grouping texts together"):
+            tokenized_datasets = tokenized_datasets.map(
+                group_texts,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc=f"Grouping texts in chunks of {max_seq_length}",
+            )

    if training_args.do_train:
        if "train" not in tokenized_datasets:
--- a/examples/pytorch/multiple-choice/run_swag.py
+++ b/examples/pytorch/multiple-choice/run_swag.py
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional, Union

+import datasets
 import numpy as np
 import torch
 from datasets import load_dataset
@@ -46,7 +47,7 @@ from transformers.utils import check_min_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")

 logger = logging.getLogger(__name__)

@@ -105,7 +106,7 @@ class DataTrainingArguments:
        default=None,
        metadata={"help": "The number of processes to use for the preprocessing."},
    )
-    max_seq_length: int = field(
+    max_seq_length: Optional[int] = field(
        default=None,
        metadata={
            "help": "The maximum total input sequence length after tokenization. If passed, sequences longer "
@@ -216,22 +217,22 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
@@ -268,10 +269,10 @@ def main():
        if data_args.validation_file is not None:
            data_files["validation"] = data_args.validation_file
        extension = data_args.train_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    else:
        # Downloading and loading the swag dataset from the hub.
-        datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset("swag", "regular", cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -347,30 +348,32 @@ def main():
        return {k: [v[i : i + 4] for i in range(0, len(v), 4)] for k, v in tokenized_examples.items()}

    if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        train_dataset = train_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+            )

    if training_args.do_eval:
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
+        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
-        eval_dataset = eval_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+            )

    # Data collator
    data_collator = (
--- a/examples/pytorch/multiple-choice/run_swag_no_trainer.py
+++ b/examples/pytorch/multiple-choice/run_swag_no_trainer.py
@@ -240,7 +240,7 @@ def main():
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
--- a/examples/pytorch/question-answering/run_qa.py
+++ b/examples/pytorch/question-answering/run_qa.py
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset, load_metric

 import transformers
@@ -47,7 +48,8 @@ from utils_qa import postprocess_qa_predictions


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")
+
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

 logger = logging.getLogger(__name__)
@@ -211,22 +213,23 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
@@ -258,7 +261,9 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
    else:
        data_files = {}
        if data_args.train_file is not None:
@@ -271,7 +276,7 @@ def main():
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -313,11 +318,11 @@ def main():
    # Preprocessing the datasets.
    # Preprocessing is slighlty different for training and evaluation.
    if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
    else:
-        column_names = datasets["test"].column_names
+        column_names = raw_datasets["test"].column_names
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]
@@ -406,21 +411,22 @@ def main():
        return tokenized_examples

    if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            # We will select sample from whole data if agument is specified
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        # Create train feature from dataset
-        train_dataset = train_dataset.map(
-            prepare_train_features,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on train dataset",
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                prepare_train_features,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on train dataset",
+            )
        if data_args.max_train_samples is not None:
            # Number of samples might increase during Feature Creation, We select only specified max samples
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
@@ -468,41 +474,43 @@ def main():
        return tokenized_examples

    if training_args.do_eval:
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
-        eval_examples = datasets["validation"]
+        eval_examples = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            # We will select sample from whole data
            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
        # Validation Feature Creation
-        eval_dataset = eval_examples.map(
-            prepare_validation_features,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on validation dataset",
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_examples.map(
+                prepare_validation_features,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on validation dataset",
+            )
        if data_args.max_eval_samples is not None:
            # During Feature creation dataset samples might increase, we will select required samples again
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

    if training_args.do_predict:
-        if "test" not in datasets:
+        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
-        predict_examples = datasets["test"]
+        predict_examples = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
            # We will select sample from whole data
            predict_examples = predict_examples.select(range(data_args.max_predict_samples))
        # Predict Feature Creation
-        predict_dataset = predict_examples.map(
-            prepare_validation_features,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on prediction dataset",
-        )
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_examples.map(
+                prepare_validation_features,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on prediction dataset",
+            )
        if data_args.max_predict_samples is not None:
            # During Feature creation dataset samples might increase, we will select required samples again
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
@@ -528,7 +536,7 @@ def main():
            max_answer_length=data_args.max_answer_length,
            null_score_diff_threshold=data_args.null_score_diff_threshold,
            output_dir=training_args.output_dir,
-            is_world_process_zero=trainer.is_world_process_zero(),
+            log_level=log_level,
            prefix=stage,
        )
        # Format the result to the format the metric expects.
--- a/examples/pytorch/question-answering/run_qa_beam_search.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search.py
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 from datasets import load_dataset, load_metric

 import transformers
@@ -46,7 +47,8 @@ from utils_qa import postprocess_qa_predictions_with_beam_search


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")
+
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

 logger = logging.getLogger(__name__)
@@ -210,22 +212,22 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
@@ -257,7 +259,9 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
    else:
        data_files = {}
        if data_args.train_file is not None:
@@ -269,7 +273,7 @@ def main():
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, field="data", cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -302,11 +306,11 @@ def main():
    # Preprocessing the datasets.
    # Preprocessing is slighlty different for training and evaluation.
    if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
    else:
-        column_names = datasets["test"].column_names
+        column_names = raw_datasets["test"].column_names
    question_column_name = "question" if "question" in column_names else column_names[0]
    context_column_name = "context" if "context" in column_names else column_names[1]
    answer_column_name = "answers" if "answers" in column_names else column_names[2]
@@ -418,21 +422,22 @@ def main():
        return tokenized_examples

    if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            # Select samples from Dataset, This will help to decrease processing time
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
        # Create Training Features
-        train_dataset = train_dataset.map(
-            prepare_train_features,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on train dataset",
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                prepare_train_features,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on train dataset",
+            )
        if data_args.max_train_samples is not None:
            # Select samples from dataset again since Feature Creation might increase number of features
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
@@ -504,41 +509,43 @@ def main():
        return tokenized_examples

    if training_args.do_eval:
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
-        eval_examples = datasets["validation"]
+        eval_examples = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            # Selecting Eval Samples from Dataset
            eval_examples = eval_examples.select(range(data_args.max_eval_samples))
        # Create Features from Eval Dataset
-        eval_dataset = eval_examples.map(
-            prepare_validation_features,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on validation dataset",
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_examples.map(
+                prepare_validation_features,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on validation dataset",
+            )
        if data_args.max_eval_samples is not None:
            # Selecting Samples from Dataset again since Feature Creation might increase samples size
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

    if training_args.do_predict:
-        if "test" not in datasets:
+        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
-        predict_examples = datasets["test"]
+        predict_examples = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
            # We will select sample from whole data
            predict_examples = predict_examples.select(range(data_args.max_predict_samples))
        # Test Feature Creation
-        predict_dataset = predict_examples.map(
-            prepare_validation_features,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on prediction dataset",
-        )
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_examples.map(
+                prepare_validation_features,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on prediction dataset",
+            )
        if data_args.max_predict_samples is not None:
            # During Feature creation dataset samples might increase, we will select required samples again
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
@@ -565,7 +572,7 @@ def main():
            start_n_top=model.config.start_n_top,
            end_n_top=model.config.end_n_top,
            output_dir=training_args.output_dir,
-            is_world_process_zero=trainer.is_world_process_zero(),
+            log_level=log_level,
            prefix=stage,
        )
        # Format the result to the format the metric expects.
--- a/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_beam_search_no_trainer.py
@@ -51,7 +51,8 @@ from utils_qa import postprocess_qa_predictions_with_beam_search


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")
+
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

 logger = logging.getLogger(__name__)
@@ -237,7 +238,7 @@ def main():
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
--- a/examples/pytorch/question-answering/run_qa_no_trainer.py
+++ b/examples/pytorch/question-answering/run_qa_no_trainer.py
@@ -53,7 +53,8 @@ from utils_qa import postprocess_qa_predictions


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")
+
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt")

 logger = logging.getLogger(__name__)
@@ -266,7 +267,7 @@ def main():
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
--- a/examples/pytorch/question-answering/utils_qa.py
+++ b/examples/pytorch/question-answering/utils_qa.py
@@ -38,7 +38,7 @@ def postprocess_qa_predictions(
    null_score_diff_threshold: float = 0.0,
    output_dir: Optional[str] = None,
    prefix: Optional[str] = None,
-    is_world_process_zero: bool = True,
+    log_level: Optional[int] = logging.WARNING,
 ):
    """
    Post-processes the predictions of a question-answering model to convert them to answers that are substrings of the
@@ -70,8 +70,8 @@ def postprocess_qa_predictions(
            answers, are saved in `output_dir`.
        prefix (:obj:`str`, `optional`):
            If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
-        is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether this process is the main process or not (used to determine if logging/saves should be done).
+        log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+            ``logging`` log level (e.g., ``logging.WARNING``)
    """
    assert len(predictions) == 2, "`predictions` should be a tuple with two elements (start_logits, end_logits)."
    all_start_logits, all_end_logits = predictions
@@ -91,7 +91,7 @@ def postprocess_qa_predictions(
        scores_diff_json = collections.OrderedDict()

    # Logging.
-    logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
+    logger.setLevel(log_level)
    logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
@@ -250,7 +250,7 @@ def postprocess_qa_predictions_with_beam_search(
    end_n_top: int = 5,
    output_dir: Optional[str] = None,
    prefix: Optional[str] = None,
-    is_world_process_zero: bool = True,
+    log_level: Optional[int] = logging.WARNING,
 ):
    """
    Post-processes the predictions of a question-answering model with beam search to convert them to answers that are substrings of the
@@ -280,8 +280,8 @@ def postprocess_qa_predictions_with_beam_search(
            answers, are saved in `output_dir`.
        prefix (:obj:`str`, `optional`):
            If provided, the dictionaries mentioned above are saved with `prefix` added to their names.
-        is_world_process_zero (:obj:`bool`, `optional`, defaults to :obj:`True`):
-            Whether this process is the main process or not (used to determine if logging/saves should be done).
+        log_level (:obj:`int`, `optional`, defaults to ``logging.WARNING``):
+            ``logging`` log level (e.g., ``logging.WARNING``)
    """
    assert len(predictions) == 5, "`predictions` should be a tuple with five elements."
    start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits = predictions
@@ -302,7 +302,7 @@ def postprocess_qa_predictions_with_beam_search(
    scores_diff_json = collections.OrderedDict() if version_2_with_negative else None

    # Logging.
-    logger.setLevel(logging.INFO if is_world_process_zero else logging.WARN)
+    logger.setLevel(log_level)
    logger.info(f"Post-processing {len(examples)} example predictions split into {len(features)} features.")

    # Let's loop over all the examples!
@@ -413,14 +413,14 @@ def postprocess_qa_predictions_with_beam_search(
                output_dir, "null_odds.json" if prefix is None else f"{prefix}_null_odds.json"
            )

-        print(f"Saving predictions to {prediction_file}.")
+        logger.info(f"Saving predictions to {prediction_file}.")
        with open(prediction_file, "w") as writer:
            writer.write(json.dumps(all_predictions, indent=4) + "\n")
-        print(f"Saving nbest_preds to {nbest_file}.")
+        logger.info(f"Saving nbest_preds to {nbest_file}.")
        with open(nbest_file, "w") as writer:
            writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
        if version_2_with_negative:
-            print(f"Saving null_odds to {null_odds_file}.")
+            logger.info(f"Saving null_odds to {null_odds_file}.")
            with open(null_odds_file, "w") as writer:
                writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

--- a/examples/pytorch/summarization/run_summarization.py
+++ b/examples/pytorch/summarization/run_summarization.py
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 import nltk  # Here to have a nice missing dependency error message early on
 import numpy as np
 from datasets import load_dataset, load_metric
@@ -47,7 +48,8 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")
+
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt")

 logger = logging.getLogger(__name__)
@@ -255,20 +257,22 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
    logger.info(f"Training/evaluation parameters {training_args}")

    if data_args.source_prefix is None and model_args.model_name_or_path in [
@@ -312,7 +316,9 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
    else:
        data_files = {}
        if data_args.train_file is not None:
@@ -324,7 +330,7 @@ def main():
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
            extension = data_args.test_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -365,11 +371,11 @@ def main():
    # Preprocessing the datasets.
    # We need to tokenize inputs and targets.
    if training_args.do_train:
-        column_names = datasets["train"].column_names
+        column_names = raw_datasets["train"].column_names
    elif training_args.do_eval:
-        column_names = datasets["validation"].column_names
+        column_names = raw_datasets["validation"].column_names
    elif training_args.do_predict:
-        column_names = datasets["test"].column_names
+        column_names = raw_datasets["test"].column_names
    else:
        logger.info("There is nothing to do. Please pass `do_train`, `do_eval` and/or `do_predict`.")
        return
@@ -424,51 +430,54 @@ def main():
        return model_inputs

    if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        train_dataset = train_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on train dataset",
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on train dataset",
+            )

    if training_args.do_eval:
        max_target_length = data_args.val_max_target_length
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
+        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
-        eval_dataset = eval_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on validation dataset",
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on validation dataset",
+            )

    if training_args.do_predict:
        max_target_length = data_args.val_max_target_length
-        if "test" not in datasets:
+        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
-        predict_dataset = datasets["test"]
+        predict_dataset = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
-        predict_dataset = predict_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on prediction dataset",
-        )
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on prediction dataset",
+            )

    # Data collator
    label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
--- a/examples/pytorch/summarization/run_summarization_no_trainer.py
+++ b/examples/pytorch/summarization/run_summarization_no_trainer.py
@@ -293,7 +293,7 @@ def main():
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
--- a/examples/pytorch/text-classification/run_glue.py
+++ b/examples/pytorch/text-classification/run_glue.py
@@ -23,6 +23,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 import numpy as np
 from datasets import load_dataset, load_metric

@@ -46,7 +47,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

@@ -200,22 +201,23 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
@@ -250,10 +252,12 @@ def main():
    # download the dataset.
    if data_args.task_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset("glue", data_args.task_name, cache_dir=model_args.cache_dir)
    elif data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
    else:
        # Loading a dataset from your local files.
        # CSV/JSON training and evaluation files are needed.
@@ -277,10 +281,10 @@ def main():

        if data_args.train_file.endswith(".csv"):
            # Loading a dataset from local csv files
-            datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir)
+            raw_datasets = load_dataset("csv", data_files=data_files, cache_dir=model_args.cache_dir)
        else:
            # Loading a dataset from local json files
-            datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
+            raw_datasets = load_dataset("json", data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

@@ -288,19 +292,19 @@ def main():
    if data_args.task_name is not None:
        is_regression = data_args.task_name == "stsb"
        if not is_regression:
-            label_list = datasets["train"].features["label"].names
+            label_list = raw_datasets["train"].features["label"].names
            num_labels = len(label_list)
        else:
            num_labels = 1
    else:
        # Trying to have good defaults here, don't hesitate to tweak to your needs.
-        is_regression = datasets["train"].features["label"].dtype in ["float32", "float64"]
+        is_regression = raw_datasets["train"].features["label"].dtype in ["float32", "float64"]
        if is_regression:
            num_labels = 1
        else:
            # A useful fast method:
            # https://huggingface.co/docs/datasets/package_reference/main_classes.html#datasets.Dataset.unique
-            label_list = datasets["train"].unique("label")
+            label_list = raw_datasets["train"].unique("label")
            label_list.sort()  # Let's sort it for determinism
            num_labels = len(label_list)

@@ -332,12 +336,12 @@ def main():
        use_auth_token=True if model_args.use_auth_token else None,
    )

-    # Preprocessing the datasets
+    # Preprocessing the raw_datasets
    if data_args.task_name is not None:
        sentence1_key, sentence2_key = task_to_keys[data_args.task_name]
    else:
        # Again, we try to have some nice defaults but don't hesitate to tweak to your use case.
-        non_label_column_names = [name for name in datasets["train"].column_names if name != "label"]
+        non_label_column_names = [name for name in raw_datasets["train"].column_names if name != "label"]
        if "sentence1" in non_label_column_names and "sentence2" in non_label_column_names:
            sentence1_key, sentence2_key = "sentence1", "sentence2"
        else:
@@ -396,30 +400,31 @@ def main():
            result["label"] = [(label_to_id[l] if l != -1 else -1) for l in examples["label"]]
        return result

-    datasets = datasets.map(
-        preprocess_function,
-        batched=True,
-        load_from_cache_file=not data_args.overwrite_cache,
-        desc="Running tokenizer on dataset",
-    )
+    with training_args.main_process_first(desc="dataset map pre-processing"):
+        raw_datasets = raw_datasets.map(
+            preprocess_function,
+            batched=True,
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc="Running tokenizer on dataset",
+        )
    if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))

    if training_args.do_eval:
-        if "validation" not in datasets and "validation_matched" not in datasets:
+        if "validation" not in raw_datasets and "validation_matched" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
+        eval_dataset = raw_datasets["validation_matched" if data_args.task_name == "mnli" else "validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))

    if training_args.do_predict or data_args.task_name is not None or data_args.test_file is not None:
-        if "test" not in datasets and "test_matched" not in datasets:
+        if "test" not in raw_datasets and "test_matched" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
-        predict_dataset = datasets["test_matched" if data_args.task_name == "mnli" else "test"]
+        predict_dataset = raw_datasets["test_matched" if data_args.task_name == "mnli" else "test"]
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))

@@ -497,7 +502,7 @@ def main():
        eval_datasets = [eval_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
-            eval_datasets.append(datasets["validation_mismatched"])
+            eval_datasets.append(raw_datasets["validation_mismatched"])

        for eval_dataset, task in zip(eval_datasets, tasks):
            metrics = trainer.evaluate(eval_dataset=eval_dataset)
@@ -518,11 +523,11 @@ def main():
        predict_datasets = [predict_dataset]
        if data_args.task_name == "mnli":
            tasks.append("mnli-mm")
-            predict_datasets.append(datasets["test_mismatched"])
+            predict_datasets.append(raw_datasets["test_mismatched"])

        for predict_dataset, task in zip(predict_datasets, tasks):
            # Removing the `label` columns because it contains -1 and Trainer won't like that.
-            predict_dataset.remove_columns_("label")
+            predict_dataset = predict_dataset.remove_columns("label")
            predictions = trainer.predict(predict_dataset, metric_key_prefix="predict").predictions
            predictions = np.squeeze(predictions) if is_regression else np.argmax(predictions, axis=1)

--- a/examples/pytorch/text-classification/run_glue_no_trainer.py
+++ b/examples/pytorch/text-classification/run_glue_no_trainer.py
@@ -168,7 +168,7 @@ def main():
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
--- a/examples/pytorch/text-classification/run_xnli.py
+++ b/examples/pytorch/text-classification/run_xnli.py
@@ -24,6 +24,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 import numpy as np
 from datasets import load_dataset, load_metric

@@ -46,7 +47,7 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")

 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt")

@@ -170,23 +171,23 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
-
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
@@ -279,12 +280,13 @@ def main():
    if training_args.do_train:
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        train_dataset = train_dataset.map(
-            preprocess_function,
-            batched=True,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on train dataset",
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                preprocess_function,
+                batched=True,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on train dataset",
+            )
        # Log a few random samples from the training set:
        for index in random.sample(range(len(train_dataset)), 3):
            logger.info(f"Sample {index} of the training set: {train_dataset[index]}.")
@@ -292,22 +294,24 @@ def main():
    if training_args.do_eval:
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
-        eval_dataset = eval_dataset.map(
-            preprocess_function,
-            batched=True,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on validation dataset",
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                preprocess_function,
+                batched=True,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on validation dataset",
+            )

    if training_args.do_predict:
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
-        predict_dataset = predict_dataset.map(
-            preprocess_function,
-            batched=True,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on prediction dataset",
-        )
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_dataset.map(
+                preprocess_function,
+                batched=True,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on prediction dataset",
+            )

    # Get the metric function
    metric = load_metric("xnli")
--- a/examples/pytorch/text-generation/run_generation.py
+++ b/examples/pytorch/text-generation/run_generation.py
@@ -41,7 +41,7 @@ from transformers import (


 logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
    level=logging.INFO,
 )
--- a/examples/pytorch/token-classification/README.md
+++ b/examples/pytorch/token-classification/README.md
@@ -19,7 +19,7 @@ limitations under the License.
 ## PyTorch version

 Fine-tuning the library models for token classification task such as Named Entity Recognition (NER), Parts-of-speech
-tagging (POS) pr phrase extraction (CHUNKS). The main scrip `run_ner.py` leverages the 🤗 Datasets library and the Trainer API. You can easily
+tagging (POS) or phrase extraction (CHUNKS). The main scrip `run_ner.py` leverages the 🤗 Datasets library and the Trainer API. You can easily
 customize it to your needs if you need extra processing on your datasets.

 It will either run on a datasets hosted on our [hub](https://huggingface.co/datasets) or with your own text files for
--- a/examples/pytorch/token-classification/run_ner.py
+++ b/examples/pytorch/token-classification/run_ner.py
@@ -25,6 +25,7 @@ import sys
 from dataclasses import dataclass, field
 from typing import Optional

+import datasets
 import numpy as np
 from datasets import ClassLabel, load_dataset, load_metric

@@ -46,7 +47,8 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")
+
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/token-classification/requirements.txt")

 logger = logging.getLogger(__name__)
@@ -190,22 +192,23 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
-    logger.setLevel(logging.INFO if training_args.should_log else logging.WARN)
+
+    log_level = training_args.get_process_log_level()
+    logger.setLevel(log_level)
+    datasets.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
        + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )
-    # Set the verbosity to info of the Transformers logger (on main process only):
-    if training_args.should_log:
-        transformers.utils.logging.set_verbosity_info()
-        transformers.utils.logging.enable_default_handler()
-        transformers.utils.logging.enable_explicit_format()
    logger.info(f"Training/evaluation parameters {training_args}")

    # Detecting last checkpoint.
@@ -237,7 +240,9 @@ def main():
    # download the dataset.
    if data_args.dataset_name is not None:
        # Downloading and loading a dataset from the hub.
-        datasets = load_dataset(data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(
+            data_args.dataset_name, data_args.dataset_config_name, cache_dir=model_args.cache_dir
+        )
    else:
        data_files = {}
        if data_args.train_file is not None:
@@ -247,16 +252,16 @@ def main():
        if data_args.test_file is not None:
            data_files["test"] = data_args.test_file
        extension = data_args.train_file.split(".")[-1]
-        datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
+        raw_datasets = load_dataset(extension, data_files=data_files, cache_dir=model_args.cache_dir)
    # See more about loading any type of standard or custom dataset (from files, python dict, pandas DataFrame, etc) at
    # https://huggingface.co/docs/datasets/loading_datasets.html.

    if training_args.do_train:
-        column_names = datasets["train"].column_names
-        features = datasets["train"].features
+        column_names = raw_datasets["train"].column_names
+        features = raw_datasets["train"].features
    else:
-        column_names = datasets["validation"].column_names
-        features = datasets["validation"].features
+        column_names = raw_datasets["validation"].column_names
+        features = raw_datasets["validation"].features

    if data_args.text_column_name is not None:
        text_column_name = data_args.text_column_name
@@ -287,7 +292,7 @@ def main():
        # No need to convert the labels since they are already ints.
        label_to_id = {i: i for i in range(len(label_list))}
    else:
-        label_list = get_label_list(datasets["train"][label_column_name])
+        label_list = get_label_list(raw_datasets["train"][label_column_name])
        label_to_id = {l: i for i, l in enumerate(label_list)}
    num_labels = len(label_list)

@@ -380,46 +385,49 @@ def main():
        return tokenized_inputs

    if training_args.do_train:
-        if "train" not in datasets:
+        if "train" not in raw_datasets:
            raise ValueError("--do_train requires a train dataset")
-        train_dataset = datasets["train"]
+        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        train_dataset = train_dataset.map(
-            tokenize_and_align_labels,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on train dataset",
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                tokenize_and_align_labels,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on train dataset",
+            )

    if training_args.do_eval:
-        if "validation" not in datasets:
+        if "validation" not in raw_datasets:
            raise ValueError("--do_eval requires a validation dataset")
-        eval_dataset = datasets["validation"]
+        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
-        eval_dataset = eval_dataset.map(
-            tokenize_and_align_labels,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on validation dataset",
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                tokenize_and_align_labels,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on validation dataset",
+            )

    if training_args.do_predict:
-        if "test" not in datasets:
+        if "test" not in raw_datasets:
            raise ValueError("--do_predict requires a test dataset")
-        predict_dataset = datasets["test"]
+        predict_dataset = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
-        predict_dataset = predict_dataset.map(
-            tokenize_and_align_labels,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on prediction dataset",
-        )
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_dataset.map(
+                tokenize_and_align_labels,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on prediction dataset",
+            )

    # Data collator
    data_collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=8 if training_args.fp16 else None)
--- a/examples/pytorch/token-classification/run_ner_no_trainer.py
+++ b/examples/pytorch/token-classification/run_ner_no_trainer.py
@@ -221,7 +221,7 @@ def main():
    accelerator = Accelerator()
    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
@@ -344,7 +344,7 @@ def main():

    model.resize_token_embeddings(len(tokenizer))

-    # Preprocessing the raw_datasets.
+    # Preprocessing the datasets.
    # First we tokenize all the texts.
    padding = "max_length" if args.pad_to_max_length else False

--- a/examples/pytorch/translation/run_translation.py
+++ b/examples/pytorch/translation/run_translation.py
@@ -51,7 +51,8 @@ from transformers.utils.versions import require_version


 # Will error if the minimal version of Transformers is not installed. Remove at your own risks.
-check_min_version("4.8.0.dev0")
+check_min_version("4.9.0")
+
 require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt")

 logger = logging.getLogger(__name__)
@@ -240,7 +241,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        handlers=[logging.StreamHandler(sys.stdout)],
    )
@@ -249,6 +250,8 @@ def main():
    logger.setLevel(log_level)
    datasets.utils.logging.set_verbosity(log_level)
    transformers.utils.logging.set_verbosity(log_level)
+    transformers.utils.logging.enable_default_handler()
+    transformers.utils.logging.enable_explicit_format()

    # Log on each process the small summary:
    logger.warning(
@@ -427,14 +430,15 @@ def main():
        train_dataset = raw_datasets["train"]
        if data_args.max_train_samples is not None:
            train_dataset = train_dataset.select(range(data_args.max_train_samples))
-        train_dataset = train_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on train dataset",
-        )
+        with training_args.main_process_first(desc="train dataset map pre-processing"):
+            train_dataset = train_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on train dataset",
+            )

    if training_args.do_eval:
        max_target_length = data_args.val_max_target_length
@@ -443,14 +447,15 @@ def main():
        eval_dataset = raw_datasets["validation"]
        if data_args.max_eval_samples is not None:
            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
-        eval_dataset = eval_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on validation dataset",
-        )
+        with training_args.main_process_first(desc="validation dataset map pre-processing"):
+            eval_dataset = eval_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on validation dataset",
+            )

    if training_args.do_predict:
        max_target_length = data_args.val_max_target_length
@@ -459,14 +464,15 @@ def main():
        predict_dataset = raw_datasets["test"]
        if data_args.max_predict_samples is not None:
            predict_dataset = predict_dataset.select(range(data_args.max_predict_samples))
-        predict_dataset = predict_dataset.map(
-            preprocess_function,
-            batched=True,
-            num_proc=data_args.preprocessing_num_workers,
-            remove_columns=column_names,
-            load_from_cache_file=not data_args.overwrite_cache,
-            desc="Running tokenizer on prediction dataset",
-        )
+        with training_args.main_process_first(desc="prediction dataset map pre-processing"):
+            predict_dataset = predict_dataset.map(
+                preprocess_function,
+                batched=True,
+                num_proc=data_args.preprocessing_num_workers,
+                remove_columns=column_names,
+                load_from_cache_file=not data_args.overwrite_cache,
+                desc="Running tokenizer on prediction dataset",
+            )

    # Data collator
    label_pad_token_id = -100 if data_args.ignore_pad_token_for_loss else tokenizer.pad_token_id
--- a/examples/pytorch/translation/run_translation_no_trainer.py
+++ b/examples/pytorch/translation/run_translation_no_trainer.py
@@ -264,7 +264,7 @@ def main():

    # Make one log on every process with the configuration for debugging.
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
    )
--- a/examples/research_projects/adversarial/run_hans.py
+++ b/examples/research_projects/adversarial/run_hans.py
@@ -115,7 +115,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
--- a/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
+++ b/examples/research_projects/bert-loses-patience/run_glue_with_pabee.py
@@ -621,7 +621,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
--- a/examples/research_projects/deebert/run_glue_deebert.py
+++ b/examples/research_projects/deebert/run_glue_deebert.py
@@ -571,7 +571,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
--- a/examples/research_projects/distillation/run_squad_w_distillation.py
+++ b/examples/research_projects/distillation/run_squad_w_distillation.py
@@ -734,7 +734,7 @@ def main():

    # Setup logging
    logging.basicConfig(
-        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN,
    )
--- a/examples/research_projects/distillation/scripts/binarized_data.py
+++ b/examples/research_projects/distillation/scripts/binarized_data.py
@@ -27,7 +27,7 @@ from transformers import BertTokenizer, GPT2Tokenizer, RobertaTokenizer


 logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
 )
 logger = logging.getLogger(__name__)

--- a/examples/research_projects/distillation/scripts/token_counts.py
+++ b/examples/research_projects/distillation/scripts/token_counts.py
@@ -22,7 +22,7 @@ from collections import Counter


 logging.basicConfig(
-    format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
+    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", datefmt="%m/%d/%Y %H:%M:%S", level=logging.INFO
 )
 logger = logging.getLogger(__name__)

--- a/examples/research_projects/jax-projects/README.md
+++ b/examples/research_projects/jax-projects/README.md
--- a/Show More
+++ b/Show More