Release: v4.1.1

Fix TAPAS doc
Put all models in the constants (#9170 )
2020-12-17 11:25:49 -05:00 · 2020-12-17 11:25:05 -05:00 · 2020-12-17 11:23:21 -05:00 · 2020-12-17 10:16:07 -05:00 · 2020-12-17 10:04:55 -05:00 · 2020-12-17 09:47:19 -05:00
1570 changed files with 56819 additions and 58644 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -3,6 +3,7 @@ orbs:
    gcp-gke: circleci/gcp-gke@1.0.4
    go: circleci/go@1.3.0

+
 # TPU REFERENCES
 references:
    checkout_ml_testing: &checkout_ml_testing
@@ -77,7 +78,8 @@ jobs:
                      - v0.4-torch_and_tf-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,torch,testing]
+            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
                key: v0.4-{{ checksum "setup.py" }}
                paths:
@@ -103,7 +105,8 @@ jobs:
                      - v0.4-torch-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing]
+            - run: pip install .[sklearn,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
@@ -129,7 +132,7 @@ jobs:
                      - v0.4-tf-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,testing]
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
            - save_cache:
                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
@@ -155,7 +158,7 @@ jobs:
                    - v0.4-flax-{{ checksum "setup.py" }}
                    - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: sudo pip install .[flax,sklearn,torch,testing]
+            - run: sudo pip install .[flax,sklearn,torch,testing,sentencepiece]
            - save_cache:
                  key: v0.4-flax-{{ checksum "setup.py" }}
                  paths:
@@ -181,7 +184,8 @@ jobs:
                      - v0.4-torch-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing]
+            - run: pip install .[sklearn,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
@@ -207,7 +211,7 @@ jobs:
                      - v0.4-tf-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,tf-cpu,testing]
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
            - save_cache:
                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
@@ -221,7 +225,7 @@ jobs:
    run_tests_custom_tokenizers:
        working_directory: ~/transformers
        docker:
-            - image: circleci/python:3.6
+            - image: circleci/python:3.7
        environment:
            RUN_CUSTOM_TOKENIZERS: yes
        steps:
@@ -231,7 +235,7 @@ jobs:
                      - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[ja,testing]
+            - run: pip install .[ja,testing,sentencepiece]
            - run: python -m unidic download
            - save_cache:
                  key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
@@ -258,8 +262,8 @@ jobs:
                      - v0.4-torch_examples-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing]
-            - run: pip install -r examples/requirements.txt
+            - run: pip install .[sklearn,torch,sentencepiece,testing]
+            - run: pip install -r examples/_tests_requirements.txt
            - save_cache:
                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
                  paths:
@@ -270,6 +274,22 @@ jobs:
            - store_artifacts:
                  path: ~/transformers/reports

+    run_tests_git_lfs:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo apt-get install git-lfs
+            - run: |
+                git config --global user.email "ci@dummy.com"
+                git config --global user.name "ci"
+            - run: pip install --upgrade pip
+            - run: pip install .[testing]
+            - run: RUN_GIT_LFS_TESTS=1 python -m pytest -sv ./tests/test_hf_api.py -k "HfLargefilesTest"
+
    build_doc:
        working_directory: ~/transformers
        docker:
@@ -281,7 +301,7 @@ jobs:
                      - v0.4-build_doc-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[tf,torch,sentencepiece,docs]
+            - run: pip install ."[all, docs]"
            - save_cache:
                  key: v0.4-build_doc-{{ checksum "setup.py" }}
                  paths:
@@ -303,7 +323,7 @@ jobs:
                  keys:
                      - v0.4-deploy_doc-{{ checksum "setup.py" }}
                      - v0.4-{{ checksum "setup.py" }}
-            - run: pip install .[tf,torch,sentencepiece,docs]
+            - run: pip install ."[all,docs]"
            - save_cache:
                  key: v0.4-deploy_doc-{{ checksum "setup.py" }}
                  paths:
@@ -324,7 +344,7 @@ jobs:
                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
            - run: pip install isort
-            - run: pip install .[tf,torch,flax,quality]
+            - run: pip install .[all,quality]
            - save_cache:
                  key: v0.4-code_quality-{{ checksum "setup.py" }}
                  paths:
@@ -334,6 +354,7 @@ jobs:
            - run: flake8 examples tests src utils
            - run: python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
            - run: python utils/check_copies.py
+            - run: python utils/check_table.py
            - run: python utils/check_dummies.py
            - run: python utils/check_repo.py

@@ -397,6 +418,7 @@ workflows:
            - run_tests_flax
            - run_tests_pipelines_torch
            - run_tests_pipelines_tf
+            - run_tests_git_lfs
            - build_doc
            - deploy_doc: *workflow_filters
    tpu_testing_jobs:
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -51,4 +51,7 @@ deploy_doc "7fb8bdf" v3.0.2
 deploy_doc "4b3ee9c" v3.1.0
 deploy_doc "3ebb1b3" v3.2.0
 deploy_doc "0613f05" v3.3.1
-deploy_doc "eb0e0ce" # v3.4.0 Latest stable release
+deploy_doc "eb0e0ce" v3.4.0
+deploy_doc "818878d" v3.5.1
+deploy_doc "c781171" v4.0.0
+deploy_doc "f5438ab" # v4.1.0 Latest stable release
--- a/.github/ISSUE_TEMPLATE/question-help.md
+++ b/.github/ISSUE_TEMPLATE/question-help.md
@@ -1,6 +1,6 @@
 ---
 name: "❓ Questions & Help"
-about: Post your general questions on the Hugging Face forum or Stack Overflow tagged huggingface-transformers
+about: "Post your general questions on the Hugging Face forum: https://discuss.huggingface.co/"
 title: ''
 labels: ''
 assignees: ''
@@ -10,18 +10,17 @@ assignees: ''
 # ❓ Questions & Help

 <!-- The GitHub issue tracker is primarly intended for bugs, feature requests,
-     new models and benchmarks, and migration questions. For all other questions,
+     new models, benchmarks, and migration questions. For all other questions,
     we direct you to the Hugging Face forum: https://discuss.huggingface.co/ .
-     You can also try Stack Overflow (SO) where a whole community of PyTorch and
-     Tensorflow enthusiast can help you out. In this case, make sure to tag your
-     question with the right deep learning framework as well as the
-     huggingface-transformers tag: 
-     https://stackoverflow.com/questions/tagged/huggingface-transformers 
     -->

 ## Details
+
 <!-- Description of your issue -->

-<!-- You should first ask your question on the forum or SO, and only if
-     you didn't get an answer ask it here on GitHub. -->
-**A link to original question on the forum/Stack Overflow**:
+<!-- You should first ask your question on the forum, and only if
+     you didn't get an answer after a few days ask it here on GitHub. -->
+
+**A link to original question on the forum**:
+
+<!-- Your issue will be closed if you don't fill this part. -->
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -20,7 +20,7 @@ Fixes # (issue)
 - [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
 - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link
-      to the it if that's the case.
+      to it if that's the case.
 - [ ] Did you make sure to update the documentation with your changes? Here are the
      [documentation guidelines](https://github.com/huggingface/transformers/tree/master/docs), and
      [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/master/docs#writing-source-documentation).
@@ -58,5 +58,5 @@ members/contributors which may be interested in your PR.
 tensorflow: @jplu
 examples/token-classification: @stefan-it
 documentation: @sgugger
- FSTM: @stas00
+ FSMT: @stas00
 -->
--- a/.github/conda/build.sh
+++ b/.github/conda/build.sh
@@ -0,0 +1 @@
+$PYTHON setup.py install     # Python command to install the script.
--- a/.github/conda/meta.yaml
+++ b/.github/conda/meta.yaml
@@ -0,0 +1,48 @@
+{% set name = "transformers" %}
+
+package:
+  name: "{{ name|lower }}"
+  version: "{{ TRANSFORMERS_VERSION }}"  # NOTE(review): not defined in this recipe; presumably supplied by the build environment - confirm
+
+source:
+  path: ../../  # two levels up from this recipe directory (.github/conda), i.e. the repository root
+
+build:
+  noarch: python
+
+requirements:
+  host:  # keep this list in sync with the run section below (same pins)
+    - python
+    - pip
+    - numpy
+    - dataclasses
+    - packaging
+    - filelock
+    - requests
+    - tqdm >=4.27
+    - sacremoses
+    - regex !=2019.12.17  # excludes this single regex release
+    - protobuf
+    - tokenizers ==0.9.4  # exact pin
+  run:  # same entries as host, minus pip
+    - python
+    - numpy
+    - dataclasses
+    - packaging
+    - filelock
+    - requests
+    - tqdm >=4.27
+    - sacremoses
+    - regex !=2019.12.17  # excludes this single regex release
+    - protobuf
+    - tokenizers ==0.9.4  # exact pin
+
+test:
+  imports:  # smoke test: the built package must be importable
+    - transformers
+
+about:
+  home: https://huggingface.co
+  license: Apache License 2.0
+  license_file: LICENSE
+  summary: "🤗Transformers: State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0."
--- a/.github/workflows/github-torch-hub.yml
+++ b/.github/workflows/github-torch-hub.yml
@@ -8,6 +8,9 @@ on:
 jobs:
  torch_hub_integration:
    runs-on: ubuntu-latest
+    env:
+      # TODO quickfix but may need more investigation
+      ACTIONS_ALLOW_UNSECURE_COMMANDS: True
    steps:
    # no checkout necessary here.
    - name: Extract branch name
--- a/.github/workflows/model-templates.yml
+++ b/.github/workflows/model-templates.yml
@@ -0,0 +1,65 @@
+name: Model templates runner  # generates models from templates/adding_a_new_model and runs their tests
+
+on:
+  push:
+    paths:  # only pushes touching these paths trigger the workflow
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+
+jobs:
+  run_tests_templates:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v1
+
+      - name: Install Python
+        uses: actions/setup-python@v1
+        with:
+          python-version: 3.6
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: ~/.cache/pip
+          key: v1.2-tests_templates-${{ hashFiles('setup.py') }}  # hash in the key so a setup.py change saves a fresh cache
+          restore-keys: |
+            v1.2-tests_templates-
+            v1.2-tests_templates
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install .[dev]
+      - name: Create model files  # one add-new-model run per template config, then style/table/dummies fix-ups
+        run: |
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          make style
+          python utils/check_table.py --fix_and_overwrite
+          python utils/check_dummies.py --fix_and_overwrite
+
+      - name: Run all non-slow tests  # limited to test files matching tests/*template*
+        run: |
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_templates tests/*template*
+
+      - name: Run style changes
+        run: |
+          git fetch origin master:master
+          make fixup
+
+      - name: Failure short reports
+        if: ${{ always() }}  # runs even when an earlier step failed
+        run: cat reports/tests_templates_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}  # runs even when an earlier step failed
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_templates_test_reports
+          path: reports
--- a/.github/workflows/release-conda.yml
+++ b/.github/workflows/release-conda.yml
@@ -0,0 +1,43 @@
+name: Release - Conda
+
+on:
+  push:
+    tags:
+      - v*  # any pushed tag whose name starts with "v"
+
+env:
+  ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}  # repository secret exposed to every step
+
+jobs:
+  build_and_package:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}  # every run step uses a bash login shell
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v1
+
+      - name: Install miniconda
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          auto-update-conda: true
+          auto-activate-base: false
+          activate-environment: "build-transformers"
+          channels: huggingface
+
+      - name: Setup conda env
+        run: |
+          conda install -c defaults anaconda-client conda-build
+
+      - name: Extract version  # appends TRANSFORMERS_VERSION (from setup.py --version) to $GITHUB_ENV for later steps
+        run: echo "TRANSFORMERS_VERSION=`python setup.py --version`" >> $GITHUB_ENV
+
+      - name: Build conda packages  # builds the recipe stored in .github/conda
+        run: |
+          conda info
+          conda build .github/conda
+
+      - name: Upload to Anaconda  # `conda build --output` prints the package path that is passed to the upload
+        run: anaconda upload `conda build .github/conda --output` --force
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -4,17 +4,19 @@ on:
  push:
    branches:
      - master
+      - ci_*
    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
+      - "templates/**"
  # pull_request:
  repository_dispatch:


 jobs:
  run_tests_torch_gpu:
-    runs-on: [self-hosted, single-gpu]
+    runs-on: [self-hosted, gpu, single-gpu]
    steps:
      - uses: actions/checkout@v2
      - name: Python version
@@ -46,8 +48,9 @@ jobs:
        run: |
          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[torch,sklearn,testing,onnxruntime]
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets
+          pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -55,6 +58,14 @@ jobs:
          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"

+#      - name: Create model files
+#        run: |
+#          source .env/bin/activate
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+
      - name: Run all non-slow tests on GPU
        env:
          OMP_NUM_THREADS: 1
@@ -76,7 +87,7 @@ jobs:
                  

  run_tests_tf_gpu:
-    runs-on: [self-hosted, single-gpu]
+    runs-on: [self-hosted, gpu, single-gpu]
    steps:
      - uses: actions/checkout@v2
      - name: Python version
@@ -107,7 +118,7 @@ jobs:
        run: |
          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[tf,sklearn,testing,onnxruntime]
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets

      - name: Are GPUs recognized by our DL frameworks
@@ -116,6 +127,14 @@ jobs:
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"

+      - name: Create model files
+        run: |
+          source .env/bin/activate
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+
      - name: Run all non-slow tests on GPU
        env:
          OMP_NUM_THREADS: 1
@@ -135,8 +154,8 @@ jobs:
          name: run_all_tests_tf_gpu_test_reports
          path: reports

-  run_tests_torch_multiple_gpu:
-    runs-on: [self-hosted, multi-gpu]
+  run_tests_torch_multi_gpu:
+    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
      - uses: actions/checkout@v2
      - name: Python version
@@ -154,7 +173,7 @@ jobs:
        id: cache
        with:
          path: .env
-          key: v1.1-tests_torch_multiple_gpu-${{ hashFiles('setup.py') }}
+          key: v1.1-tests_torch_multi_gpu-${{ hashFiles('setup.py') }}

      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
        run: |
@@ -167,8 +186,9 @@ jobs:
        run: |
          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[torch,sklearn,testing,onnxruntime]
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets
+          pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html

      - name: Are GPUs recognized by our DL frameworks
        run: |
@@ -181,11 +201,11 @@ jobs:
          OMP_NUM_THREADS: 1
        run: |
          source .env/bin/activate
-          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_multiple_gpu tests
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/tests_torch_multiple_gpu_failures_short.txt          
+        run: cat reports/tests_torch_multi_gpu_failures_short.txt          

      - name: Test suite reports artifacts
        if: ${{ always() }}
@@ -194,8 +214,8 @@ jobs:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports

-  run_tests_tf_multiple_gpu:
-    runs-on: [self-hosted, multi-gpu]
+  run_tests_tf_multi_gpu:
+    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
      - uses: actions/checkout@v2
      - name: Python version
@@ -213,7 +233,7 @@ jobs:
        id: cache
        with:
          path: .env
-          key: v1.1-tests_tf_multiple_gpu-${{ hashFiles('setup.py') }}
+          key: v1.1-tests_tf_multi_gpu-${{ hashFiles('setup.py') }}

      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
        run: |
@@ -226,7 +246,7 @@ jobs:
        run: |
          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[tf,sklearn,testing,onnxruntime]
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets

      - name: Are GPUs recognized by our DL frameworks
@@ -240,11 +260,11 @@ jobs:
          OMP_NUM_THREADS: 1
        run: |
          source .env/bin/activate
-          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_multiple_gpu tests
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/tests_tf_multiple_gpu_failures_short.txt
+        run: cat reports/tests_tf_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -6,16 +6,13 @@
 name: Self-hosted runner (scheduled)

 on:
-  push:
-    branches:
-      - ci_*
  repository_dispatch:
  schedule:
    - cron: "0 0 * * *"

 jobs:
  run_all_tests_torch_gpu:
-    runs-on: [self-hosted, single-gpu]
+    runs-on: [self-hosted, gpu, single-gpu]
    steps:
      - uses: actions/checkout@v2

@@ -49,7 +46,7 @@ jobs:
        run: |
          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[torch,sklearn,testing,onnxruntime]
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets
          pip list

@@ -109,7 +106,7 @@ jobs:


  run_all_tests_tf_gpu:
-    runs-on: [self-hosted, single-gpu]
+    runs-on: [self-hosted, gpu, single-gpu]
    steps:
      - uses: actions/checkout@v2

@@ -143,7 +140,7 @@ jobs:
        run: |
          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[tf,sklearn,testing,onnxruntime]
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets
          pip list

@@ -187,8 +184,8 @@ jobs:
          name: run_all_tests_tf_gpu_test_reports
          path: reports
          
-  run_all_tests_torch_multiple_gpu:
-    runs-on: [self-hosted, multi-gpu]
+  run_all_tests_torch_multi_gpu:
+    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
      - uses: actions/checkout@v2

@@ -222,7 +219,7 @@ jobs:
        run: |
          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[torch,sklearn,testing,onnxruntime]
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets
          pip list

@@ -238,11 +235,11 @@ jobs:
          RUN_SLOW: yes
        run: |
          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multiple_gpu tests
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/tests_torch_multiple_gpu_failures_short.txt
+        run: cat reports/tests_torch_multi_gpu_failures_short.txt

      - name: Run examples tests on multi-GPU
        env:
@@ -250,11 +247,11 @@ jobs:
          RUN_SLOW: yes
        run: |
          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_multiple_gpu examples
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/examples_torch_multiple_gpu_failures_short.txt
+        run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt

      - name: Run all pipeline tests on multi-GPU
        if: ${{ always() }}
@@ -265,11 +262,11 @@ jobs:
          RUN_PIPELINE_TESTS: yes
        run: |
          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multiple_gpu tests
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/tests_torch_pipeline_multiple_gpu_failures_short.txt
+        run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
@@ -278,8 +275,8 @@ jobs:
          name: run_all_tests_torch_multi_gpu_test_reports
          path: reports

-  run_all_tests_tf_multiple_gpu:
-    runs-on: [self-hosted, multi-gpu]
+  run_all_tests_tf_multi_gpu:
+    runs-on: [self-hosted, gpu, multi-gpu]
    steps:
      - uses: actions/checkout@v2

@@ -313,7 +310,7 @@ jobs:
        run: |
          source .env/bin/activate
          pip install --upgrade pip
-          pip install .[tf,sklearn,testing,onnxruntime]
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
          pip install git+https://github.com/huggingface/datasets
          pip list

@@ -329,11 +326,11 @@ jobs:
          RUN_SLOW: yes
        run: |
          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multiple_gpu tests
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests

      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/tests_tf_multiple_gpu_failures_short.txt
+        run: cat reports/tests_tf_multi_gpu_failures_short.txt

      - name: Run all pipeline tests on multi-GPU
        if: ${{ always() }}
@@ -344,11 +341,11 @@ jobs:
          RUN_PIPELINE_TESTS: yes
        run: |
          source .env/bin/activate
-          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_multiple_gpu tests
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
          
      - name: Failure short reports
        if: ${{ always() }}
-        run: cat reports/tests_tf_multiple_gpu_pipelines_failures_short.txt
+        run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt

      - name: Test suite reports artifacts
        if: ${{ always() }}
--- a/.gitignore
+++ b/.gitignore
@@ -133,7 +133,6 @@ dmypy.json
 tensorflow_code

 # Models
-models
 proc_data

 # examples
@@ -160,3 +159,6 @@ tags

 # pre-commit
 .pre-commit*
+
+# .lock
+*.lock
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,3 +1,19 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # How to contribute to transformers?

 Everyone is welcome to contribute, and we value everybody's contribution. Code
@@ -125,7 +141,7 @@ Follow these steps to start contributing:
   $ git checkout -b a-descriptive-name-for-my-changes
   ```

-   **do not** work on the `master` branch.
+   **Do not** work on the `master` branch.

 4. Set up a development environment by running the following command in a virtual environment:

@@ -308,3 +324,25 @@ Check our [documentation writing guide](https://github.com/huggingface/transform
 for more information.

 #### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md)
+
+
+### Develop on Windows
+
+One way to run the make command on Windows is to go through MSYS2:
+
+1. [Download MSYS2](https://www.msys2.org/); we assume it is installed in C:\msys64
+2. Open the command line C:\msys64\msys2.exe (it should be available from the start menu)
+3. Run in the shell: `pacman -Syu` and install make with `pacman -S make`
+
+### Syncing forked master with upstream (HuggingFace) master
+
+To avoid pinging the upstream repository, which adds reference notes to each upstream PR and sends unnecessary notifications to the developers involved in these PRs,
+when syncing the master branch of a forked repository, please follow these steps:
+1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead merge directly into the forked master.
+2. If a PR is absolutely necessary, use the following steps after checking out your branch:
+```
+$ git checkout -b your-branch-for-syncing
+$ git pull --squash --no-commit upstream master
+$ git commit -m '<your message without GitHub references>'
+$ git push --set-upstream origin your-branch-for-syncing
+```
--- a/LICENSE
+++ b/LICENSE
@@ -1,3 +1,4 @@
+Copyright 2018- The Hugging Face team. All rights reserved.

                                 Apache License
                           Version 2.0, January 2004
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,4 @@
-.PHONY: modified_only_fixup extra_quality_checks quality style fixup fix-copies test test-examples docs
+.PHONY: deps_table_update modified_only_fixup extra_quality_checks quality style fixup fix-copies test test-examples docs


 check_dirs := examples tests src utils
@@ -14,10 +14,16 @@ modified_only_fixup:
 		echo "No library .py files were modified"; \
 	fi

+# Update src/transformers/dependency_versions_table.py
+
+deps_table_update:
+	@python setup.py deps_table_update
+
 # Check that source code meets quality standards

-extra_quality_checks:
+extra_quality_checks: deps_table_update
 	python utils/check_copies.py
+	python utils/check_table.py
 	python utils/check_dummies.py
 	python utils/check_repo.py
 	python utils/style_doc.py src/transformers docs/source --max_len 119
@@ -32,7 +38,7 @@ quality:

 # Format source code automatically and check is there are any problems left that need manual fixing

-style:
+style: deps_table_update
 	black $(check_dirs)
 	isort $(check_dirs)
 	python utils/style_doc.py src/transformers docs/source --max_len 119
@@ -45,6 +51,7 @@ fixup: modified_only_fixup extra_quality_checks

 fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
+	python utils/check_table.py --fix_and_overwrite
 	python utils/check_dummies.py --fix_and_overwrite

 # Run tests for the library
--- a/README.md
+++ b/README.md
@@ -1,3 +1,19 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 <p align="center">
    <br>
    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
@@ -31,9 +47,6 @@

 🤗 Transformers is backed by the two most popular deep learning libraries, [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/), with a seamless integration between them, allowing you to train your models with one then load it for inference with the other.

-### Recent contributors
-[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/0)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/0)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/1)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/1)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/2)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/2)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/3)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/3)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/4)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/4)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/5)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/5)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/6)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/6)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/7)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/7)
-
 ## Online demos

 You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer an [inference API](https://huggingface.co/pricing) to use those models.
@@ -137,14 +150,16 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta

 ## Installation

+### With pip
+
 This repository is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for [examples](https://github.com/huggingface/transformers/tree/master/examples)) and TensorFlow 2.0.

 You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

 First, create a virtual environment with the version of Python you're going to use and activate it.

-Then, you will need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
+Then, you will need to install at least one of TensorFlow 2.0, PyTorch or Flax.
+Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax installation page](https://github.com/google/flax#quick-install) regarding the specific install command for your platform.

 When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:

@@ -154,12 +169,29 @@ pip install transformers

 If you'd like to play with the examples, you must [install the library from source](https://huggingface.co/transformers/installation.html#installing-from-source).

+### With conda
+
+Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
+
+🤗 Transformers can be installed using conda as follows:
+
+```shell script
+conda install -c huggingface transformers
+```
+
+Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda. 
+
 ## Models architectures

+**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co) where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).
+
+Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
 🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/transformers/model_summary.html) for a high-level summary of each them):

 1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
 1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
@@ -181,6 +213,8 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
 1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
+1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
 1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
 1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
@@ -188,14 +222,16 @@ Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
 ultilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
 1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
 1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[TAPAS](https://huggingface.co/transformers/master/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
 1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
 1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-1. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
 1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.

+To check if each model has an implementation in PyTorch/TensorFlow/Flax or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/transformers/index.html#bigtable).
+
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations. You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).


@@ -213,13 +249,17 @@ These implementations have been tested on several datasets (see the example scri

 ## Citation

-We now have a [paper](https://arxiv.org/abs/1910.03771) you can cite for the 🤗 Transformers library:
+We now have a [paper](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) you can cite for the 🤗 Transformers library:
 ```bibtex
-@article{Wolf2019HuggingFacesTS,
-  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
-  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush},
-  journal={ArXiv},
-  year={2019},
-  volume={abs/1910.03771}
+@inproceedings{wolf-etal-2020-transformers,
+    title = "Transformers: State-of-the-Art Natural Language Processing",
+    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = oct,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+    pages = "38--45"
 }
 ```
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,3 +1,19 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # Generating the documentation

 To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -2,6 +2,15 @@

 /* Colab dropdown */

+table.center-aligned-table td {
+    text-align: center;
+}
+
+table.center-aligned-table th {
+    text-align: center;
+    vertical-align: middle;
+}
+
 .colab-dropdown {
    position: relative;
    display: inline-block;
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,13 +1,16 @@
 // These two things need to be updated at each release for the version selector.
 // Last stable version
-const stableVersion = "v3.4.0"
-// Dictionary doc folder to label
+const stableVersion = "v4.1.0"
+// Dictionary doc folder to label. The last stable version should have an empty key.
 const versionMapping = {
    "master": "master",
-    "": "v3.4.0",
+    "": "v4.1.0 (stable)",
+    "v4.0.1": "v4.0.0/v4.0.1",
+    "v3.5.1": "v3.5.0/v3.5.1",
+    "v3.4.0": "v3.4.0",
    "v3.3.1": "v3.3.0/v3.3.1",
    "v3.2.0": "v3.2.0",
-    "v3.1.0": "v3.1.0 (stable)",
+    "v3.1.0": "v3.1.0",
    "v3.0.2": "v3.0.0/v3.0.1/v3.0.2",
    "v2.11.0": "v2.11.0",
    "v2.10.0": "v2.10.0",
--- a/docs/source/benchmarks.rst
+++ b/docs/source/benchmarks.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Benchmarks
 =======================================================================================================================

--- a/docs/source/bertology.rst
+++ b/docs/source/bertology.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 BERTology
 -----------------------------------------------------------------------------------------------------------------------

@@ -22,5 +34,5 @@ help people access the inner representations, mainly adapted from the great work
  in https://arxiv.org/abs/1905.10650.

 To help you understand and use these features, we have added a specific example script: `bertology.py
-<https://github.com/huggingface/transformers/blob/master/examples/bertology/run_bertology.py>`_ while extract
-information and prune a model pre-trained on GLUE.
+<https://github.com/huggingface/transformers/blob/master/examples/research_projects/bertology/run_bertology.py>`_ which
+extracts information and prunes a model pre-trained on GLUE.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -20,13 +20,13 @@ sys.path.insert(0, os.path.abspath('../../src'))
 # -- Project information -----------------------------------------------------

 project = u'transformers'
-copyright = u'2020, huggingface'
+copyright = u'2020, The Hugging Face Team, Licenced under the Apache License, Version 2.0'
 author = u'huggingface'

 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'3.5.0'
+release = u'4.1.1'


 # -- General configuration ---------------------------------------------------
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Converting Tensorflow Checkpoints
 =======================================================================================================================

--- a/docs/source/custom_datasets.rst
+++ b/docs/source/custom_datasets.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Fine-tuning with custom datasets
 =======================================================================================================================

--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Glossary
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -22,6 +22,18 @@ State-of-the-art NLP for everyone:
 - Hands-on practitioners
 - AI/ML/NLP teachers and educators

+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Lower compute costs, smaller carbon footprint:

 - Researchers can share trained models instead of always retraining
@@ -35,6 +47,16 @@ Choose the right framework for every part of a model's lifetime:
 - Move a single model between TF2.0/PyTorch frameworks at will
 - Seamlessly pick the right framework for training, evaluation, production

+Experimental support for Flax with a few models right now, expected to grow in the coming months.
+
+`All the model checkpoints <https://huggingface.co/models>`__ are seamlessly integrated from the huggingface.co `model
+hub <https://huggingface.co>`__ where they are uploaded directly by `users <https://huggingface.co/users>`__ and
+`organizations <https://huggingface.co/organizations>`__.
+
+Current number of checkpoints: |checkpoints|
+
+.. |checkpoints| image:: https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen
+
 Contents
 -----------------------------------------------------------------------------------------------------------------------

@@ -44,7 +66,7 @@ The documentation is organized in five parts:
  and a glossary.
 - **USING 🤗 TRANSFORMERS** contains general tutorials on how to use the library.
 - **ADVANCED GUIDES** contains more advanced guides that are more specific to a given script or part of the library.
- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general resarch in
+- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general research in
  transformers model
 - The three last section contain the documentation of each public class and function, grouped in:

@@ -52,8 +74,8 @@ The documentation is organized in five parts:
    - **MODELS** for the classes and functions related to each model implemented in the library.
    - **INTERNAL HELPERS** for the classes and functions we use internally.

-The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and
-conversion utilities for the following models:
+The library currently contains PyTorch, Tensorflow and Flax implementations, pretrained model weights, usage scripts
+and conversion utilities for the following models:

 ..
    This list is updated automatically from the README with `make fix-copies`. Do not update manually!
@@ -66,102 +88,206 @@ conversion utilities for the following models:
   Pre-training for Natural Language Generation, Translation, and Comprehension
   <https://arxiv.org/pdf/1910.13461.pdf>`__ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman
   Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-3. :doc:`BERT <model_doc/bert>` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
+3. :doc:`BARThez <model_doc/barthez>` (from École polytechnique) released with the paper `BARThez: a Skilled Pretrained
+   French Sequence-to-Sequence Model <https://arxiv.org/abs/2010.12321>`__ by Moussa Kamal Eddine, Antoine J.-P.
+   Tixier, Michalis Vazirgiannis.
+4. :doc:`BERT <model_doc/bert>` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
   Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__ by Jacob Devlin, Ming-Wei Chang,
   Kenton Lee and Kristina Toutanova.
-4. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
+5. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
   Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
   Narayan, Aliaksei Severyn.
-5. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
+6. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
-6. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
+7. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
   French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
   Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-7. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
+8. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
   Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
   Lav R. Varshney, Caiming Xiong and Richard Socher.
-8. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft Research) released with the paper `DeBERTa: Decoding-enhanced
+9. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft Research) released with the paper `DeBERTa: Decoding-enhanced
   BERT with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
   Weizhu Chen.
-9. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
-   Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe Zhang,
-   Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-10. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+10. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+    Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
+    Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+11. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
    distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
    Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
    `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
    version of DistilBERT.
-11. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+12. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
    Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
    Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-12. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+13. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
    Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
    Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-13. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+14. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
    Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
    Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-14. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+15. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
    Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
    Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-15. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+16. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
    Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
    and Ilya Sutskever.
-16. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+17. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
    Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
    Luan, Dario Amodei** and Ilya Sutskever**.
-17. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+18. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
    of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
    Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-18. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+19. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
    Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-19. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+20. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
    Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
    by Hao Tan and Mohit Bansal.
-20. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+21. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
    Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
    Translator Team.
-21. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+22. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
    Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
    Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-22. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+23. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+    Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
+    Jianfeng Lu, Tie-Yan Liu.
+24. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+    text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
+    Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
+25. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__ by Jingqing Zhang, Yao Zhao,
    Mohammad Saleh and Peter J. Liu.
-23. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+26. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
    Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
    Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-24. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+27. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
    Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-25. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
+28. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper a `Robustly Optimized BERT
    Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar
    Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-26. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
+29. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
    about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
    Krishna, and Kurt W. Keutzer.
-27. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+30. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
    Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
    Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-28. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+31. `TAPAS <https://huggingface.co/transformers/master/model_doc/tapas.html>`__ (from Google AI) released with the
+    paper `TAPAS: Weakly Supervised Table Parsing via Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan
+    Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
+32. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
    Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
    Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-29. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+33. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
    Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
-30. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+34. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
    Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
    Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
-31. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+35. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
    Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
    Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
    Zettlemoyer and Veselin Stoyanov.
-32. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+36. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
    Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
    Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-33. `Other community models <https://huggingface.co/models>`__, contributed by the `community
-    <https://huggingface.co/users>`__.
+
+
+.. _bigtable:
+
+The table below represents the current support in the library for each of those models: whether they have a Python
+tokenizer (called "slow"), a "fast" tokenizer backed by the 🤗 Tokenizers library, and whether they have support in
+PyTorch, TensorFlow and/or Flax.
+
+..
+    This table is updated automatically from the auto modules with `make fix-copies`. Do not update manually!
+
+.. rst-class:: center-aligned-table
+
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            Model            | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support |
+=============================+================+================+=================+====================+==============+
+|           ALBERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            BART             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            BERT             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|       Bert Generation       |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         Blenderbot          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          CamemBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             DPR             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           DeBERTa           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           ELECTRA           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|       Encoder decoder       |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| FairSeq Machine-Translation |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          FlauBERT           |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|     Funnel Transformer      |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           LXMERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         Longformer          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         MobileBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         OpenAI GPT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|        OpenAI GPT-2         |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           Pegasus           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         ProphetNet          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             RAG             |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          Reformer           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          RetriBERT          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           RoBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         SqueezeBERT         |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             T5              |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            TAPAS            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|       Transformer-XL        |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             XLM             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         XLM-RoBERTa         |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|        XLMProphetNet        |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            XLNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            mBART            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             mT5             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+

 .. toctree::
    :maxdepth: 2
@@ -228,6 +354,7 @@ conversion utilities for the following models:
    model_doc/albert
    model_doc/auto
    model_doc/bart
+    model_doc/barthez
    model_doc/bert
    model_doc/bertgeneration
    model_doc/blenderbot
@@ -248,6 +375,8 @@ conversion utilities for the following models:
    model_doc/marian
    model_doc/mbart
    model_doc/mobilebert
+    model_doc/mpnet
+    model_doc/mt5
    model_doc/gpt
    model_doc/gpt2
    model_doc/pegasus
@@ -258,6 +387,7 @@ conversion utilities for the following models:
    model_doc/roberta
    model_doc/squeezebert
    model_doc/t5
+    model_doc/tapas
    model_doc/transformerxl
    model_doc/xlm
    model_doc/xlmprophetnet
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -1,3 +1,19 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # Installation

 🤗 Transformers is tested on Python 3.6+, and PyTorch 1.1.0+ or TensorFlow 2.0+.
@@ -12,9 +28,10 @@ must install it from source.
 ## Installation with pip

 First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) 
-and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific 
-install command for your platform.
+Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available),
+the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or the
+[Flax installation page](https://github.com/google/flax#quick-install)
+regarding the specific install command for your platform.

 When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:

@@ -34,6 +51,12 @@ or 🤗 Transformers and TensorFlow 2.0 in one line with:
 pip install transformers[tf-cpu]
 ```

+or 🤗 Transformers and Flax in one line with:
+
+```bash
+pip install transformers[flax]
+```
+
 To check 🤗 Transformers is properly installed, run the following command:

 ```bash
@@ -66,19 +89,32 @@ python -c "from transformers import pipeline; print(pipeline('sentiment-analysis

 to check 🤗 Transformers is properly installed.

+
+## With conda
+
+Since version v4.0.0, 🤗 Transformers has a conda channel: `huggingface`.
+
+🤗 Transformers can be installed using conda as follows:
+
+```
+conda install -c huggingface transformers
+```
+
+Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda. 
+
 ## Caching models

 This library provides pretrained models that will be downloaded and cached locally. Unless you specify a location with
 `cache_dir=...` when you use methods like `from_pretrained`, these models will automatically be downloaded in the
-folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the PyTorch
-cache home followed by ``/transformers/`` (even if you don't have PyTorch installed). This is (by order of priority):
+folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the Hugging
+Face cache home followed by ``/transformers/``. The Hugging Face cache home is (by order of priority):

-  * shell environment variable ``TORCH_HOME``
-  * shell environment variable ``XDG_CACHE_HOME`` + ``/torch/``
-  * default: ``~/.cache/torch/``
+  * shell environment variable ``HF_HOME`` 
+  * shell environment variable ``XDG_CACHE_HOME`` + ``/huggingface/``
+  * default: ``~/.cache/huggingface/``

 So if you don't have any specific environment variable set, the cache directory will be at
-``~/.cache/torch/transformers/``.
+``~/.cache/huggingface/transformers/``.

 **Note:** If you have set a shell environment variable for one of the predecessors of this library
 (``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
@@ -97,6 +133,6 @@ You should check out our [swift-coreml-transformers](https://github.com/huggingf
 It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, 
 `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.

-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch or
+At some point in the future, you'll be able to seamlessly move from pretraining or fine-tuning models in PyTorch or
 TensorFlow 2.0 to productizing them in CoreML, or prototype a model or an app in CoreML then research its
 hyperparameters or architecture from PyTorch or TensorFlow 2.0. Super exciting!
--- a/docs/source/internal/generation_utils.rst
+++ b/docs/source/internal/generation_utils.rst
@@ -1,9 +1,22 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Utilities for Generation
 -----------------------------------------------------------------------------------------------------------------------

 This page lists all the utility functions used by :meth:`~transformers.PreTrainedModel.generate`,
 :meth:`~transformers.PreTrainedModel.greedy_search`, :meth:`~transformers.PreTrainedModel.sample`,
-:meth:`~transformers.PretrainedModel.beam_search`, and :meth:`~transformers.PretrainedModel.beam_sample`.
+:meth:`~transformers.PreTrainedModel.beam_search`, :meth:`~transformers.PreTrainedModel.beam_sample`, and
+:meth:`~transformers.PreTrainedModel.group_beam_search`.

 Most of those are only useful if you are studying the code of the generate methods in the library.

@@ -19,6 +32,9 @@ generation.
 .. autoclass:: transformers.LogitsProcessorList
    :members: __call__

+.. autoclass:: transformers.LogitsWarper
+    :members: __call__
+
 .. autoclass:: transformers.MinLengthLogitsProcessor
    :members: __call__

@@ -40,6 +56,12 @@ generation.
 .. autoclass:: transformers.NoBadWordsLogitsProcessor
    :members: __call__

+.. autoclass:: transformers.PrefixConstrainedLogitsProcessor
+    :members: __call__
+
+.. autoclass:: transformers.HammingDiversityLogitsProcessor
+    :members: __call__
+
 BeamSearch
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -48,3 +70,10 @@ BeamSearch

 .. autoclass:: transformers.BeamSearchScorer
    :members: process, finalize
+
+Utilities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.top_k_top_p_filtering
+
+.. autofunction:: transformers.tf_top_k_top_p_filtering
--- a/docs/source/internal/modeling_utils.rst
+++ b/docs/source/internal/modeling_utils.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Custom Layers and Utilities
 -----------------------------------------------------------------------------------------------------------------------

@@ -79,8 +91,6 @@ TensorFlow loss functions
 TensorFlow Helper Functions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autofunction:: transformers.modeling_tf_utils.cast_bool_to_primitive
-
 .. autofunction:: transformers.modeling_tf_utils.get_initializer

 .. autofunction:: transformers.modeling_tf_utils.keras_serializable
--- a/docs/source/internal/pipelines_utils.rst
+++ b/docs/source/internal/pipelines_utils.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Utilities for pipelines
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/internal/tokenization_utils.rst
+++ b/docs/source/internal/tokenization_utils.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Utilities for Tokenizers
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/internal/trainer_utils.rst
+++ b/docs/source/internal/trainer_utils.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Utilities for Trainer
 -----------------------------------------------------------------------------------------------------------------------

@@ -10,6 +22,8 @@ Utilities

 .. autoclass:: transformers.EvalPrediction

+.. autoclass:: transformers.EvaluationStrategy
+
 .. autofunction:: transformers.set_seed

 .. autofunction:: transformers.torch_distributed_zero_first
@@ -20,8 +34,15 @@ Callbacks internals

 .. autoclass:: transformers.trainer_callback.CallbackHandler

+
 Distributed Evaluation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.trainer_pt_utils.DistributedTensorGatherer
    :members:
+
+
+HfArgumentParser
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.HfArgumentParser
--- a/docs/source/main_classes/callback.rst
+++ b/docs/source/main_classes/callback.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Callbacks
 -----------------------------------------------------------------------------------------------------------------------

@@ -44,6 +56,8 @@ Here is the list of the available :class:`~transformers.TrainerCallback` in the

 .. autoclass:: transformers.ProgressCallback

+.. autoclass:: transformers.EarlyStoppingCallback
+
 .. autoclass:: transformers.integrations.TensorBoardCallback

 .. autoclass:: transformers.integrations.WandbCallback
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Configuration
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/main_classes/logging.rst
+++ b/docs/source/main_classes/logging.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Logging
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -1,9 +1,22 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Models
 -----------------------------------------------------------------------------------------------------------------------

-The base classes :class:`~transformers.PreTrainedModel` and :class:`~transformers.TFPreTrainedModel` implement the
-common methods for loading/saving a model either from a local file or directory, or from a pretrained model
-configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
+The base classes :class:`~transformers.PreTrainedModel`, :class:`~transformers.TFPreTrainedModel`, and
+:class:`~transformers.FlaxPreTrainedModel` implement the common methods for loading/saving a model either from a local
+file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS
+S3 repository).

 :class:`~transformers.PreTrainedModel` and :class:`~transformers.TFPreTrainedModel` also implement a few methods which
 are common among all the models to:
@@ -45,6 +58,13 @@ TFModelUtilsMixin
    :members:


+FlaxPreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxPreTrainedModel
+    :members:
+
+
 Generation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Optimization
 -----------------------------------------------------------------------------------------------------------------------

@@ -62,6 +74,10 @@ Learning Rate Schedules (Pytorch)
    :target: /imgs/warmup_linear_schedule.png
    :alt:

+
+.. autofunction:: transformers.get_polynomial_decay_schedule_with_warmup
+
+
 Warmup (TensorFlow)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

--- a/docs/source/main_classes/output.rst
+++ b/docs/source/main_classes/output.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Model outputs
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Pipelines
 -----------------------------------------------------------------------------------------------------------------------

@@ -22,6 +34,7 @@ There are two categories of pipeline abstractions to be aware about:
    - :class:`~transformers.TranslationPipeline`
    - :class:`~transformers.ZeroShotClassificationPipeline`
    - :class:`~transformers.Text2TextGenerationPipeline`
+    - :class:`~transformers.TableQuestionAnsweringPipeline`

 The pipeline abstraction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -61,8 +74,9 @@ FillMaskPipeline
 NerPipeline
 =======================================================================================================================

-This class is an alias of the :class:`~transformers.TokenClassificationPipeline` defined below. Please refer to that
-pipeline for documentation and usage examples.
+.. autoclass:: transformers.NerPipeline
+
+See :class:`~transformers.TokenClassificationPipeline` for all details.

 QuestionAnsweringPipeline
 =======================================================================================================================
@@ -78,6 +92,13 @@ SummarizationPipeline
    :special-members: __call__
    :members:

+TableQuestionAnsweringPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.TableQuestionAnsweringPipeline
+    :special-members: __call__
+
+
 TextClassificationPipeline
 =======================================================================================================================

@@ -106,6 +127,13 @@ TokenClassificationPipeline
    :special-members: __call__
    :members:

+TranslationPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.TranslationPipeline
+    :special-members: __call__
+    :members:
+
 ZeroShotClassificationPipeline
 =======================================================================================================================

--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Processors
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Tokenizer
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Trainer
 -----------------------------------------------------------------------------------------------------------------------

@@ -35,7 +47,7 @@ Here is an example of how to customize :class:`~transformers.Trainer` using a cu
    class MyTrainer(Trainer):
        def compute_loss(self, model, inputs):
            labels = inputs.pop("labels")
-            outputs = models(**inputs)
+            outputs = model(**inputs)
            logits = outputs[0]
            return my_custom_loss(logits, labels)

--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -1,5 +1,186 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # Migrating from previous packages

+## Migrating from transformers `v3.x` to `v4.x`
+
+A couple of changes were introduced when the switch from version 3 to version 4 was done. Below is a summary of the
+expected changes:
+
+#### 1. AutoTokenizers and pipelines now use fast (rust) tokenizers by default.
+
+The python and rust tokenizers have roughly the same API, but the rust tokenizers have a more complete feature set. 
+
+This introduces two breaking changes:
+- The handling of overflowing tokens between the python and rust tokenizers is different.
+- The rust tokenizers do not accept integers in the encoding methods.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+- The pipelines now contain additional features out of the box. See the [token-classification pipeline with the `grouped_entities` flag](https://huggingface.co/transformers/main_classes/pipelines.html?highlight=textclassification#tokenclassificationpipeline).
+- The auto-tokenizers now return rust tokenizers. In order to obtain the python tokenizers instead, the user may use the `use_fast` flag by setting it to `False`:
+
+In version `v3.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+```
+to obtain the same in version `v4.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
+```
+
+#### 2. SentencePiece is removed from the required dependencies
+
+The requirement on the SentencePiece dependency has been lifted from the `setup.py`. This is done so that we may have a channel on anaconda cloud without relying on `conda-forge`. This means that the tokenizers that depend on the SentencePiece library will not be available with a standard `transformers` installation.
+
+This includes the **slow** versions of:
+- `XLNetTokenizer`
+- `AlbertTokenizer`
+- `CamembertTokenizer`
+- `MBartTokenizer`
+- `PegasusTokenizer`
+- `T5Tokenizer`
+- `ReformerTokenizer`
+- `XLMRobertaTokenizer`
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should install `sentencepiece` additionally:
+
+In version `v3.x`:
+```bash
+pip install transformers
+```
+to obtain the same in version `v4.x`:
+```bash
+pip install transformers[sentencepiece]
+```
+or
+```bash
+pip install transformers sentencepiece
+```
+#### 3. The architecture of the repo has been updated so that each model resides in its folder
+
+The past and foreseeable addition of new models means that the number of files in the directory `src/transformers` keeps growing and becomes harder to navigate and understand. We made the choice to put each model and the files accompanying it in their own sub-directories.
+
+This is a breaking change as importing intermediary layers using a model's module directly needs to be done via a different path.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should update the path used to access the layers. 
+
+In version `v3.x`:
+```py
+from transformers.modeling_bert import BertLayer
+```
+to obtain the same in version `v4.x`:
+```py
+from transformers.models.bert.modeling_bert import BertLayer
+```
+
+#### 4. Switching the `return_dict` argument to `True` by default
+
+The [`return_dict` argument](https://huggingface.co/transformers/main_classes/output.html) enables the return of dict-like python objects containing the model outputs, instead of the standard tuples. This object is self-documented as keys can be used to retrieve values, while also behaving as a tuple as users may retrieve objects by index or by slice.
+
+This is a breaking change as the limitation of that tuple is that it cannot be unpacked: `value0, value1 = outputs` will not work.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should specify the `return_dict` argument to `False`, either in the model configuration or during the forward pass.
+
+In version `v3.x`:
+```py
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs)
+```
+to obtain the same in version `v4.x`:
+```py
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs, return_dict=False)
+```
+or
+```py
+model = BertModel.from_pretrained("bert-base-cased", return_dict=False)
+outputs = model(**inputs)
+```
+
+#### 5. Removed some deprecated attributes
+
+Attributes that were deprecated have been removed if they had been deprecated for at least a month. The full list of deprecated attributes can be found in [#8604](https://github.com/huggingface/transformers/pull/8604).
+
+Here is a list of these attributes/methods/arguments and what their replacements should be:
+
+In several models, the labels become consistent with the other models:
+- `masked_lm_labels` becomes `labels` in `AlbertForMaskedLM` and `AlbertForPreTraining`.
+- `masked_lm_labels` becomes `labels` in `BertForMaskedLM` and `BertForPreTraining`.
+- `masked_lm_labels` becomes `labels` in `DistilBertForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `ElectraForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `LongformerForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `MobileBertForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `RobertaForMaskedLM`.
+- `lm_labels` becomes `labels` in `BartForConditionalGeneration`.
+- `lm_labels` becomes `labels` in `GPT2DoubleHeadsModel`.
+- `lm_labels` becomes `labels` in `OpenAIGPTDoubleHeadsModel`.
+- `lm_labels` becomes `labels` in `T5ForConditionalGeneration`.
+
+In several models, the caching mechanism becomes consistent with the other models:
+- `decoder_cached_states` becomes `past_key_values` in all BART-like, FSMT and T5 models.
+- `decoder_past_key_values` becomes `past_key_values` in all BART-like, FSMT and T5 models.
+- `past` becomes `past_key_values` in all CTRL models.
+- `past` becomes `past_key_values` in all GPT-2 models.
+
+Regarding the tokenizer classes:
+- The tokenizer attribute `max_len` becomes `model_max_length`.
+- The tokenizer attribute `return_lengths` becomes `return_length`.
+- The tokenizer encoding argument `is_pretokenized` becomes `is_split_into_words`.
+
+Regarding the `Trainer` class:
+- The `Trainer` argument `tb_writer` is removed in favor of the callback `TensorBoardCallback(tb_writer=...)`.
+- The `Trainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
+- The `Trainer` attribute `data_collator` should be a callable.
+- The `Trainer` method `_log` is deprecated in favor of `log`.
+- The `Trainer` method `_training_step` is deprecated in favor of `training_step`.
+- The `Trainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
+- The `Trainer` method `is_local_master` is deprecated in favor of `is_local_process_zero`.
+- The `Trainer` method `is_world_master` is deprecated in favor of `is_world_process_zero`.
+
+Regarding the `TFTrainer` class:
+- The `TFTrainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
+- The `TFTrainer` method `_log` is deprecated in favor of `log`.
+- The `TFTrainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
+- The `TFTrainer` method `_setup_wandb` is deprecated in favor of `setup_wandb`.
+- The `TFTrainer` method `_run_model` is deprecated in favor of `run_model`.
+
+Regarding the `TrainingArguments` class:
+- The `TrainingArguments` argument `evaluate_during_training` is deprecated in favor of `evaluation_strategy`.
+
+Regarding the Transfo-XL model:
+- The Transfo-XL configuration attribute `tie_weight` becomes `tie_word_embeddings`.
+- The Transfo-XL modeling method `reset_length` becomes `reset_memory_length`.
+
+Regarding pipelines:
+- The `FillMaskPipeline` argument `topk` becomes `top_k`.
+
+
+
 ## Migrating from pytorch-transformers to 🤗 Transformers

 Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to 🤗 Transformers.
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 ALBERT
 -----------------------------------------------------------------------------------------------------------------------

@@ -48,13 +60,20 @@ AlbertTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+AlbertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertTokenizerFast
+    :members:
+
+
 Albert specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_albert.AlbertForPreTrainingOutput
+.. autoclass:: transformers.models.albert.modeling_albert.AlbertForPreTrainingOutput
    :members:

-.. autoclass:: transformers.modeling_tf_albert.TFAlbertForPreTrainingOutput
+.. autoclass:: transformers.models.albert.modeling_tf_albert.TFAlbertForPreTrainingOutput
    :members:


--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Auto Classes
 -----------------------------------------------------------------------------------------------------------------------

@@ -102,6 +114,13 @@ AutoModelForQuestionAnswering
    :members:


+AutoModelForTableQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForTableQuestionAnswering
+    :members:
+
+
 TFAutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -163,3 +182,10 @@ TFAutoModelForQuestionAnswering

 .. autoclass:: transformers.TFAutoModelForQuestionAnswering
    :members:
+
+
+FlaxAutoModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAutoModel
+    :members:
--- a/docs/source/model_doc/bart.rst
+++ b/docs/source/model_doc/bart.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 BART
 -----------------------------------------------------------------------------------------------------------------------

@@ -34,6 +46,8 @@ ________________________________________________________________________________
 - An example of how to train :class:`~transformers.BartForConditionalGeneration` with a Hugging Face :obj:`datasets`
  object can be found in this `forum discussion
  <https://discuss.huggingface.co/t/train-bart-for-conditional-generation-e-g-summarization/1904>`__.
+- `Distilled checkpoints <https://huggingface.co/models?search=distilbart>`__ are described in this `paper
+  <https://arxiv.org/abs/2010.13002>`__.


 Implementation Notes
@@ -42,16 +56,33 @@ Implementation Notes
 - Bart doesn't use :obj:`token_type_ids` for sequence classification. Use :class:`~transformers.BartTokenizer` or
  :meth:`~transformers.BartTokenizer.encode` to get the proper splitting.
 - The forward pass of :class:`~transformers.BartModel` will create decoder inputs (using the helper function
-  :func:`transformers.modeling_bart._prepare_bart_decoder_inputs`) if they are not passed. This is different than some
-  other modeling APIs.
- Model predictions are intended to be identical to the original implementation. This only works, however, if the
-  string you pass to :func:`fairseq.encode` starts with a space.
+  :func:`transformers.models.bart.modeling_bart._prepare_bart_decoder_inputs`) if they are not passed. This is
+  different than some other modeling APIs.
+- Model predictions are intended to be identical to the original implementation when
+  :obj:`force_bos_token_to_be_generated=True`. This only works, however, if the string you pass to
+  :func:`fairseq.encode` starts with a space.
 - :meth:`~transformers.BartForConditionalGeneration.generate` should be used for conditional generation tasks like
  summarization, see the example in that docstrings.
 - Models that load the `facebook/bart-large-cnn` weights will not have a :obj:`mask_token_id`, or be able to perform
  mask-filling tasks.
 - For training/forward passes that don't involve beam search, pass :obj:`use_cache=False`.

+Mask Filling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The :obj:`facebook/bart-base` and :obj:`facebook/bart-large` checkpoints can be used to fill multi-token masks.
+
+.. code-block:: python
+
+    from transformers import BartForConditionalGeneration, BartTokenizer
+    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", force_bos_token_to_be_generated=True)
+    tok = BartTokenizer.from_pretrained("facebook/bart-large")
+    example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
+    batch = tok(example_english_phrase, return_tensors='pt')
+    generated_ids = model.generate(batch['input_ids'])
+    assert tok.batch_decode(generated_ids, skip_special_tokens=True) == ['UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria']
+
+

 BartConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -67,6 +98,12 @@ BartTokenizer
    :members:


+BartTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartTokenizerFast
+    :members:
+

 BartModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -74,8 +111,6 @@ BartModel
 .. autoclass:: transformers.BartModel
    :members: forward

-.. autofunction:: transformers.modeling_bart._prepare_bart_decoder_inputs
-

 BartForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
--- a/docs/source/model_doc/barthez.rst
+++ b/docs/source/model_doc/barthez.rst
@@ -0,0 +1,59 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BARThez
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BARThez model was proposed in `BARThez: a Skilled Pretrained French Sequence-to-Sequence Model
+<https://arxiv.org/abs/2010.12321>`__ by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis on 23 Oct,
+2020.
+
+The abstract of the paper:
+
+
+*Inductive transfer learning, enabled by self-supervised learning, have taken the entire Natural Language Processing
+(NLP) field by storm, with models such as BERT and BART setting new state of the art on countless natural language
+understanding tasks. While there are some notable exceptions, most of the available models and research have been
+conducted for the English language. In this work, we introduce BARThez, the first BART model for the French language
+(to the best of our knowledge). BARThez was pretrained on a very large monolingual French corpus from past research
+that we adapted to suit BART's perturbation schemes. Unlike already existing BERT-based French language models such as
+CamemBERT and FlauBERT, BARThez is particularly well-suited for generative tasks, since not only its encoder but also
+its decoder is pretrained. In addition to discriminative tasks from the FLUE benchmark, we evaluate BARThez on a novel
+summarization dataset, OrangeSum, that we release with this paper. We also continue the pretraining of an already
+pretrained multilingual BART on BARThez's corpus, and we show that the resulting model, which we call mBARTHez,
+provides a significant boost over vanilla BARThez, and is on par with or outperforms CamemBERT and FlauBERT.*
+
+The authors' code can be found `here <https://github.com/moussaKam/BARThez>`__.
+
+
+Examples
+_______________________________________________________________________________________________________________________
+
+- BARThez can be fine-tuned on sequence-to-sequence tasks in a similar way as BART, check: `examples/seq2seq/
+  <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/README.md>`__.
+
+
+BarthezTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BarthezTokenizer
+    :members:
+
+
+BarthezTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BarthezTokenizerFast
+    :members:
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 BERT
 -----------------------------------------------------------------------------------------------------------------------

@@ -57,10 +69,10 @@ BertTokenizerFast
 Bert specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_bert.BertForPreTrainingOutput
+.. autoclass:: transformers.models.bert.modeling_bert.BertForPreTrainingOutput
    :members:

-.. autoclass:: transformers.modeling_tf_bert.TFBertForPreTrainingOutput
+.. autoclass:: transformers.models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
    :members:


@@ -188,3 +200,17 @@ TFBertForQuestionAnswering

 .. autoclass:: transformers.TFBertForQuestionAnswering
    :members: call
+
+
+FlaxBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBertModel
+    :members: __call__
+
+
+FlaxBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBertForMaskedLM
+    :members: __call__
--- a/docs/source/model_doc/bertgeneration.rst
+++ b/docs/source/model_doc/bertgeneration.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 BertGeneration
 -----------------------------------------------------------------------------------------------------------------------

@@ -10,7 +22,7 @@ Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi Narayan, Ali

 The abstract from the paper is the following:

-*Unsupervised pre-training of large neural models has recently revolutionized Natural Language Processing. By
+*Unsupervised pretraining of large neural models has recently revolutionized Natural Language Processing. By
 warm-starting from the publicly released checkpoints, NLP practitioners have pushed the state-of-the-art on multiple
 benchmarks while saving significant amounts of compute time. So far the focus has been mainly on the Natural Language
 Understanding tasks. In this paper, we demonstrate the efficacy of pre-trained checkpoints for Sequence Generation. We
@@ -40,7 +52,7 @@ Usage:
  labels = tokenizer('This is a short summary', return_tensors="pt").input_ids

  # train...
-  loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels, return_dict=True).loss
+  loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
  loss.backward()


--- a/docs/source/model_doc/blenderbot.rst
+++ b/docs/source/model_doc/blenderbot.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Blenderbot
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/camembert.rst
+++ b/docs/source/model_doc/camembert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 CamemBERT
 -----------------------------------------------------------------------------------------------------------------------

@@ -42,6 +54,13 @@ CamembertTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+CamembertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertTokenizerFast
+    :members:
+
+
 CamembertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 CTRL
 -----------------------------------------------------------------------------------------------------------------------

@@ -65,6 +77,13 @@ CTRLLMHeadModel
    :members: forward


+CTRLForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CTRLForSequenceClassification
+    :members: forward
+
+
 TFCTRLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/deberta.rst
+++ b/docs/source/model_doc/deberta.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 DeBERTa
 -----------------------------------------------------------------------------------------------------------------------

@@ -20,8 +32,8 @@ disentangled attention mechanism, where each word is represented using two vecto
 position, respectively, and the attention weights among words are computed using disentangled matrices on their
 contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
 predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
-of model pre-training and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half
-of the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
+of model pretraining and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of
+the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
 (90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
 pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*

--- a/docs/source/model_doc/dialogpt.rst
+++ b/docs/source/model_doc/dialogpt.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 DialoGPT
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 DistilBERT
 -----------------------------------------------------------------------------------------------------------------------

@@ -18,9 +30,9 @@ operating these large models in on-the-edge and/or under constrained computation
 remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation
 model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger
 counterparts. While most prior work investigated the use of distillation for building task-specific models, we leverage
-knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a BERT model by
+knowledge distillation during the pretraining phase and show that it is possible to reduce the size of a BERT model by
 40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage the inductive
-biases learned by larger models during pre-training, we introduce a triple loss combining language modeling,
+biases learned by larger models during pretraining, we introduce a triple loss combining language modeling,
 distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train and we
 demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative on-device
 study.*
--- a/docs/source/model_doc/dpr.rst
+++ b/docs/source/model_doc/dpr.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 DPR
 -----------------------------------------------------------------------------------------------------------------------

@@ -5,7 +17,7 @@ Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Dense Passage Retrieval (DPR) is a set of tools and models for state-of-the-art open-domain Q&A research. It was
-intorduced in `Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`__ by
+introduced in `Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`__ by
 Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih.

 The abstract from the paper is the following:
@@ -71,13 +83,13 @@ DPRReaderTokenizerFast
 DPR specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_dpr.DPRContextEncoderOutput
+.. autoclass:: transformers.models.dpr.modeling_dpr.DPRContextEncoderOutput
    :members:

-.. autoclass:: transformers.modeling_dpr.DPRQuestionEncoderOutput
+.. autoclass:: transformers.models.dpr.modeling_dpr.DPRQuestionEncoderOutput
    :members:

-.. autoclass:: transformers.modeling_dpr.DPRReaderOutput
+.. autoclass:: transformers.models.dpr.modeling_dpr.DPRReaderOutput
    :members:


@@ -99,3 +111,22 @@ DPRReader

 .. autoclass:: transformers.DPRReader
    :members: forward
+
+TFDPRContextEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDPRContextEncoder
+    :members: call
+
+TFDPRQuestionEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDPRQuestionEncoder
+    :members: call
+
+
+TFDPRReader
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDPRReader
+    :members: call
--- a/docs/source/model_doc/electra.rst
+++ b/docs/source/model_doc/electra.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 ELECTRA
 -----------------------------------------------------------------------------------------------------------------------

@@ -12,14 +24,14 @@ identify which tokens were replaced by the generator in the sequence.

 The abstract from the paper is the following:

-*Masked language modeling (MLM) pre-training methods such as BERT corrupt the input by replacing some tokens with
-[MASK] and then train a model to reconstruct the original tokens. While they produce good results when transferred to
+*Masked language modeling (MLM) pretraining methods such as BERT corrupt the input by replacing some tokens with [MASK]
+and then train a model to reconstruct the original tokens. While they produce good results when transferred to
 downstream NLP tasks, they generally require large amounts of compute to be effective. As an alternative, we propose a
-more sample-efficient pre-training task called replaced token detection. Instead of masking the input, our approach
+more sample-efficient pretraining task called replaced token detection. Instead of masking the input, our approach
 corrupts it by replacing some tokens with plausible alternatives sampled from a small generator network. Then, instead
 of training a model that predicts the original identities of the corrupted tokens, we train a discriminative model that
 predicts whether each token in the corrupted input was replaced by a generator sample or not. Thorough experiments
-demonstrate this new pre-training task is more efficient than MLM because the task is defined over all input tokens
+demonstrate this new pretraining task is more efficient than MLM because the task is defined over all input tokens
 rather than just the small subset that was masked out. As a result, the contextual representations learned by our
 approach substantially outperform the ones learned by BERT given the same model size, data, and compute. The gains are
 particularly strong for small models; for example, we train a model on one GPU for 4 days that outperforms GPT (trained
@@ -69,10 +81,10 @@ ElectraTokenizerFast
 Electra specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_electra.ElectraForPreTrainingOutput
+.. autoclass:: transformers.models.electra.modeling_electra.ElectraForPreTrainingOutput
    :members:

-.. autoclass:: transformers.modeling_tf_electra.TFElectraForPreTrainingOutput
+.. autoclass:: transformers.models.electra.modeling_tf_electra.TFElectraForPreTrainingOutput
    :members:


--- a/docs/source/model_doc/encoderdecoder.rst
+++ b/docs/source/model_doc/encoderdecoder.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Encoder Decoder Models
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/flaubert.rst
+++ b/docs/source/model_doc/flaubert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 FlauBERT
 -----------------------------------------------------------------------------------------------------------------------

@@ -19,7 +31,7 @@ representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018;
 heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre for
 Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text
 classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most of the
-time they outperform other pre-training approaches. Different versions of FlauBERT as well as a unified evaluation
+time they outperform other pretraining approaches. Different versions of FlauBERT as well as a unified evaluation
 protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared to the research
 community for further reproducible experiments in French NLP.*

--- a/docs/source/model_doc/fsmt.rst
+++ b/docs/source/model_doc/fsmt.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 FSMT
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/funnel.rst
+++ b/docs/source/model_doc/funnel.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Funnel Transformer
 -----------------------------------------------------------------------------------------------------------------------

@@ -65,10 +77,10 @@ FunnelTokenizerFast
 Funnel specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_funnel.FunnelForPreTrainingOutput
+.. autoclass:: transformers.models.funnel.modeling_funnel.FunnelForPreTrainingOutput
    :members:

-.. autoclass:: transformers.modeling_tf_funnel.TFFunnelForPreTrainingOutput
+.. autoclass:: transformers.models.funnel.modeling_tf_funnel.TFFunnelForPreTrainingOutput
    :members:


--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 OpenAI GPT
 -----------------------------------------------------------------------------------------------------------------------

@@ -14,7 +26,7 @@ The abstract from the paper is the following:
 *Natural language understanding comprises a wide range of diverse tasks such as textual entailment, question answering,
 semantic similarity assessment, and document classification. Although large unlabeled text corpora are abundant,
 labeled data for learning these specific tasks is scarce, making it challenging for discriminatively trained models to
-perform adequately. We demonstrate that large gains on these tasks can be realized by generative pre-training of a
+perform adequately. We demonstrate that large gains on these tasks can be realized by generative pretraining of a
 language model on a diverse corpus of unlabeled text, followed by discriminative fine-tuning on each specific task. In
 contrast to previous approaches, we make use of task-aware input transformations during fine-tuning to achieve
 effective transfer while requiring minimal changes to the model architecture. We demonstrate the effectiveness of our
@@ -72,10 +84,10 @@ OpenAIGPTTokenizerFast
 OpenAI specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_openai.OpenAIGPTDoubleHeadsModelOutput
+.. autoclass:: transformers.models.openai.modeling_openai.OpenAIGPTDoubleHeadsModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_openai.TFOpenAIGPTDoubleHeadsModelOutput
+.. autoclass:: transformers.models.openai.modeling_tf_openai.TFOpenAIGPTDoubleHeadsModelOutput
    :members:


@@ -126,3 +138,9 @@ TFOpenAIGPTDoubleHeadsModel

 .. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
    :members: call
+
+TFOpenAIGPTForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFOpenAIGPTForSequenceClassification
+    :members: call
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 OpenAI GPT2
 -----------------------------------------------------------------------------------------------------------------------

@@ -60,10 +72,10 @@ GPT2TokenizerFast
 GPT2 specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_gpt2.GPT2DoubleHeadsModelOutput
+.. autoclass:: transformers.models.gpt2.modeling_gpt2.GPT2DoubleHeadsModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_gpt2.TFGPT2DoubleHeadsModelOutput
+.. autoclass:: transformers.models.gpt2.modeling_tf_gpt2.TFGPT2DoubleHeadsModelOutput
    :members:


@@ -71,14 +83,14 @@ GPT2Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.GPT2Model
-    :members: forward
+    :members: forward, parallelize, deparallelize


 GPT2LMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.GPT2LMHeadModel
-    :members: forward
+    :members: forward, parallelize, deparallelize


 GPT2DoubleHeadsModel
@@ -114,3 +126,15 @@ TFGPT2DoubleHeadsModel

 .. autoclass:: transformers.TFGPT2DoubleHeadsModel
    :members: call
+
+TFGPT2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFGPT2ForSequenceClassification
+    :members: call
+
+TFSequenceClassifierOutputWithPast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFSequenceClassifierOutputWithPast
+    :members:
--- a/docs/source/model_doc/layoutlm.rst
+++ b/docs/source/model_doc/layoutlm.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 LayoutLM
 -----------------------------------------------------------------------------------------------------------------------

@@ -6,19 +18,19 @@ Overview

 The LayoutLM model was proposed in the paper `LayoutLM: Pre-training of Text and Layout for Document Image
 Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and
-Ming Zhou. It's a simple but effective pre-training method of text and layout for document image understanding and
+Ming Zhou. It's a simple but effective pretraining method of text and layout for document image understanding and
 information extraction tasks, such as form understanding and receipt understanding.

 The abstract from the paper is the following:

 *Pre-training techniques have been verified successfully in a variety of NLP tasks in recent years. Despite the
-widespread use of pre-training models for NLP applications, they almost exclusively focus on text-level manipulation,
+widespread use of pretraining models for NLP applications, they almost exclusively focus on text-level manipulation,
 while neglecting layout and style information that is vital for document image understanding. In this paper, we propose
 the \textbf{LayoutLM} to jointly model interactions between text and layout information across scanned document images,
 which is beneficial for a great number of real-world document image understanding tasks such as information extraction
 from scanned documents. Furthermore, we also leverage image features to incorporate words' visual information into
 LayoutLM. To the best of our knowledge, this is the first time that text and layout are jointly learned in a single
-framework for document-level pre-training. It achieves new state-of-the-art results in several downstream tasks,
+framework for document-level pretraining. It achieves new state-of-the-art results in several downstream tasks,
 including form understanding (from 70.72 to 79.27), receipt understanding (from 94.02 to 95.24) and document image
 classification (from 93.07 to 94.42).*

@@ -45,6 +57,13 @@ LayoutLMTokenizer
    :members:


+LayoutLMTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMTokenizerFast
+    :members:
+
+
 LayoutLMModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/longformer.rst
+++ b/docs/source/model_doc/longformer.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Longformer
 -----------------------------------------------------------------------------------------------------------------------

@@ -22,6 +34,12 @@ contrast to most prior work, we also pretrain Longformer and finetune it on a va
 pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on
 WikiHop and TriviaQA.*

+Tips:
+
+- Since the Longformer is based on RoBERTa, it doesn't have :obj:`token_type_ids`. You don't need to indicate which
+  token belongs to which segment. Just separate your segments with the separation token :obj:`tokenizer.sep_token` (or
+  :obj:`</s>`).
+
 The Authors' code can be found `here <https://github.com/allenai/longformer>`__.

 Longformer Self Attention
@@ -93,29 +111,47 @@ LongformerTokenizerFast
 Longformer specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_longformer.LongformerBaseModelOutput
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerBaseModelOutput
    :members: 

-.. autoclass:: transformers.modeling_longformer.LongformerBaseModelOutputWithPooling
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerBaseModelOutputWithPooling
    :members: 

-.. autoclass:: transformers.modeling_longformer.LongformerMultipleChoiceModelOutput
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerMaskedLMOutput
    :members: 

-.. autoclass:: transformers.modeling_longformer.LongformerQuestionAnsweringModelOutput
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerQuestionAnsweringModelOutput
    :members: 

-.. autoclass:: transformers.modeling_tf_longformer.TFLongformerBaseModelOutput
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput
    :members: 

-.. autoclass:: transformers.modeling_tf_longformer.TFLongformerBaseModelOutputWithPooling
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerMultipleChoiceModelOutput
    :members: 

-.. autoclass:: transformers.modeling_tf_longformer.TFLongformerQuestionAnsweringModelOutput
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerTokenClassifierOutput
    :members: 

-LongformerModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerBaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerBaseModelOutputWithPooling
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerMaskedLMOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerQuestionAnsweringModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerSequenceClassifierOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerMultipleChoiceModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerTokenClassifierOutput
+    :members: 

 LongformerModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -179,3 +215,24 @@ TFLongformerForQuestionAnswering
 .. autoclass:: transformers.TFLongformerForQuestionAnswering
    :members: call

+
+TFLongformerForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLongformerForSequenceClassification
+    :members: call
+
+
+TFLongformerForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLongformerForTokenClassification
+    :members: call
+
+
+TFLongformerForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLongformerForMultipleChoice
+    :members: call
+
--- a/docs/source/model_doc/lxmert.rst
+++ b/docs/source/model_doc/lxmert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 LXMERT
 -----------------------------------------------------------------------------------------------------------------------

@@ -19,7 +31,7 @@ Encoder Representations from Transformers) framework to learn these vision-and-l
 build a large-scale Transformer model that consists of three encoders: an object relationship encoder, a language
 encoder, and a cross-modality encoder. Next, to endow our model with the capability of connecting vision and language
 semantics, we pre-train the model with large amounts of image-and-sentence pairs, via five diverse representative
-pre-training tasks: masked language modeling, masked object prediction (feature regression and label classification),
+pretraining tasks: masked language modeling, masked object prediction (feature regression and label classification),
 cross-modality matching, and image question answering. These tasks help in learning both intra-modality and
 cross-modality relationships. After fine-tuning from our pretrained parameters, our model achieves the state-of-the-art
 results on two visual question answering datasets (i.e., VQA and GQA). We also show the generalizability of our
@@ -67,19 +79,19 @@ LxmertTokenizerFast
 Lxmert specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_lxmert.LxmertModelOutput
+.. autoclass:: transformers.models.lxmert.modeling_lxmert.LxmertModelOutput
    :members:

-.. autoclass:: transformers.modeling_lxmert.LxmertForPreTrainingOutput
+.. autoclass:: transformers.models.lxmert.modeling_lxmert.LxmertForPreTrainingOutput
    :members:

-.. autoclass:: transformers.modeling_lxmert.LxmertForQuestionAnsweringOutput
+.. autoclass:: transformers.models.lxmert.modeling_lxmert.LxmertForQuestionAnsweringOutput
    :members:

-.. autoclass:: transformers.modeling_tf_lxmert.TFLxmertModelOutput
+.. autoclass:: transformers.models.lxmert.modeling_tf_lxmert.TFLxmertModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_lxmert.TFLxmertForPreTrainingOutput
+.. autoclass:: transformers.models.lxmert.modeling_tf_lxmert.TFLxmertForPreTrainingOutput
    :members:


--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 MarianMT
 -----------------------------------------------------------------------------------------------------------------------

@@ -5,7 +17,7 @@ MarianMT
 <https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title>`__
 and assign @patrickvonplaten.

-Translations should be similar, but not identical to, output in the test set linked to in each model card.
+Translations should be similar, but not identical, to the output in the test set linked to in each model card.

 Implementation Notes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -35,24 +47,109 @@ Naming
  <https://developers.google.com/admin-sdk/directory/v1/languages>`__, three digit codes require googling "language
  code {code}".
 - Codes formatted like :obj:`es_AR` are usually :obj:`code_{region}`. That one is Spanish from Argentina.
+- The models were converted in two stages. The first 1000 models use ISO-639-2 codes to identify languages; the second
+  group uses a combination of ISO-639-5 codes and ISO-639-2 codes.


+Examples
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Since Marian models are smaller than many other translation models available in the library, they can be useful for
+  fine-tuning experiments and integration tests.
+- `Fine-tune on TPU
+  <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/builtin_trainer/train_distil_marian_enro_tpu.sh>`__
+- `Fine-tune on GPU
+  <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/builtin_trainer/train_distil_marian_enro.sh>`__
+- `Fine-tune on GPU with pytorch-lightning
+  <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/distil_marian_no_teacher.sh>`__
+
 Multilingual Models
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-All model names use the following format: :obj:`Helsinki-NLP/opus-mt-{src}-{tgt}`:
+- All model names use the following format: :obj:`Helsinki-NLP/opus-mt-{src}-{tgt}`.
+- If a model can output multiple languages, you should specify a language code by prepending the desired output
+  language to the :obj:`src_text`.
+- You can see a model's supported language codes in its model card, under target constituents, like in `opus-mt-en-roa
+  <https://huggingface.co/Helsinki-NLP/opus-mt-en-roa>`__.
+- Note that if a model is only multilingual on the source side, like :obj:`Helsinki-NLP/opus-mt-roa-en`, no language
+  codes are required.

-    - If :obj:`src` is in all caps, the model supports multiple input languages, you can figure out which ones by
-      looking at the model card, or the Group Members `mapping
-      <https://gist.github.com/sshleifer/6d20e7761931b08e73c3219027b97b8a>`_ .
-    - If :obj:`tgt` is in all caps, the model can output multiple languages, and you should specify a language code by
-      prepending the desired output language to the :obj:`src_text`.
-    - You can see a tokenizer's supported language codes in ``tokenizer.supported_language_codes``
-
-Example of translating english to many romance languages, using language codes:
+New multi-lingual models from the `Tatoeba-Challenge repo <https://github.com/Helsinki-NLP/Tatoeba-Challenge>`__
+require 3 character language codes:

 .. code-block:: python

+    from transformers import MarianMTModel, MarianTokenizer
+    src_text = [
+        '>>fra<< this is a sentence in english that we want to translate to french',
+        '>>por<< This should go to portuguese',
+        '>>esp<< And this to Spanish'
+    ]
+
+    model_name = 'Helsinki-NLP/opus-mt-en-roa'
+    tokenizer = MarianTokenizer.from_pretrained(model_name)
+    print(tokenizer.supported_language_codes)
+    model = MarianMTModel.from_pretrained(model_name)
+    translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
+    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+    # ["c'est une phrase en anglais que nous voulons traduire en français",
+    # 'Isto deve ir para o português.',
+    # 'Y esto al español']
+
+
+
+
+Code to see available pretrained models:
+
+.. code-block:: python
+
+    from transformers.hf_api import HfApi
+    model_list = HfApi().model_list()
+    org = "Helsinki-NLP"
+    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
+    suffix = [x.split('/')[1] for x in model_ids]
+    old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
+
+
+
+Old Style Multi-Lingual Models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These are the old style multi-lingual models ported from the OPUS-MT-Train repo, and the members of each language
+group:
+
+.. code-block:: python
+
+    ['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU',
+     'Helsinki-NLP/opus-mt-ROMANCE-en',
+     'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA',
+     'Helsinki-NLP/opus-mt-de-ZH',
+     'Helsinki-NLP/opus-mt-en-CELTIC',
+     'Helsinki-NLP/opus-mt-en-ROMANCE',
+     'Helsinki-NLP/opus-mt-es-NORWAY',
+     'Helsinki-NLP/opus-mt-fi-NORWAY',
+     'Helsinki-NLP/opus-mt-fi-ZH',
+     'Helsinki-NLP/opus-mt-fi_nb_no_nn_ru_sv_en-SAMI',
+     'Helsinki-NLP/opus-mt-sv-NORWAY',
+     'Helsinki-NLP/opus-mt-sv-ZH']
+    GROUP_MEMBERS = {
+     'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
+     'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
+     'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+     'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+     'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'],
+     'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'],
+     'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
+    }
+
+
+
+
+Example of translating english to many romance languages, using old-style 2 character language codes:
+
+
+.. code-block:: python
+
    from transformers import MarianMTModel, MarianTokenizer
    src_text = [
        '>>fr<< this is a sentence in english that we want to translate to french',
@@ -63,52 +160,12 @@ Example of translating english to many romance languages, using language codes:
    model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    print(tokenizer.supported_language_codes)
+
    model = MarianMTModel.from_pretrained(model_name)
-    translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text))
+    translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-    # ["c'est une phrase en anglais que nous voulons traduire en français",
-    # 'Isto deve ir para o português.',
-    # 'Y esto al español']
+    # ["c'est une phrase en anglais que nous voulons traduire en français", 'Isto deve ir para o português.',  'Y esto al español']

-Sometimes, models were trained on collections of languages that do not resolve to a group. In this case, _ is used as a
-separator for src or tgt, as in :obj:`Helsinki-NLP/opus-mt-en_el_es_fi-en_el_es_fi`. These still require language
-codes.
-
-There are many supported regional language codes, like :obj:`>>es_ES<<` (Spain) and :obj:`>>es_AR<<` (Argentina), that
-do not seem to change translations. I have not found these to provide different results than just using :obj:`>>es<<`.
-
-For example:
-
-    - `Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU`: translates from all NORTH_EU languages (see `mapping
-      <https://gist.github.com/sshleifer/6d20e7761931b08e73c3219027b97b8a>`_) to all NORTH_EU languages. Use a special
-      language code like :obj:`>>de<<` to specify output language.
-    - `Helsinki-NLP/opus-mt-ROMANCE-en`: translates from many romance languages to english, no codes needed since there
-      is only one target language.
-
-
-
-.. code-block:: python
-
-    GROUP_MEMBERS = {
-     'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
-     'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
-     'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
-     'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
-     'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'],
-     'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'],
-     'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
-    }
-
-Code to see available pretrained models:
-
-.. code-block:: python
-
-    from transformers.hf_api import HfApi
-    model_list = HfApi().model_list()
-    org = "Helsinki-NLP"
-    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
-    suffix = [x.split('/')[1] for x in model_ids]
-    multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]


 MarianConfig
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 MBart
 -----------------------------------------------------------------------------------------------------------------------

@@ -13,12 +25,19 @@ The MBart model was presented in `Multilingual Denoising Pre-training for Neural
 Ghazvininejad, Mike Lewis, Luke Zettlemoyer.

 According to the abstract, MBART is a sequence-to-sequence denoising auto-encoder pretrained on large-scale monolingual
-corpora in many languages using the BART objective. mBART is one of the first methods for pre-training a complete
+corpora in many languages using the BART objective. mBART is one of the first methods for pretraining a complete
 sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only
 on the encoder, decoder, or reconstructing parts of the text.

 The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/mbart>`__

+Examples
+_______________________________________________________________________________________________________________________
+
+- Examples and scripts for fine-tuning mBART and other models for sequence to sequence tasks can be found in
+  `examples/seq2seq/ <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/README.md>`__.
+- Given the large embeddings table, mBART consumes a large amount of GPU RAM, especially for fine-tuning.
+  :class:`MarianMTModel` is usually a better choice for bilingual machine translation.

 Training
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -37,12 +56,8 @@ the sequences for sequence-to-sequence fine-tuning.

    example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
    expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-    batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian)
-    input_ids = batch["input_ids"]
-    target_ids = batch["decoder_input_ids"]
-    decoder_input_ids = target_ids[:, :-1].contiguous()
-    labels = target_ids[:, 1:].clone()
-    model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, labels=labels) #forward
+    batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt")
+    model(input_ids=batch['input_ids'], labels=batch['labels']) # forward pass

 - Generation

@@ -55,7 +70,7 @@ the sequences for sequence-to-sequence fine-tuning.
    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
    article = "UN Chief Says There Is No Military Solution in Syria"
-    batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX")
+    batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX", return_tensors="pt")
    translated_tokens = model.generate(**batch, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
    translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria"
@@ -75,6 +90,13 @@ MBartTokenizer
    :members: build_inputs_with_special_tokens, prepare_seq2seq_batch


+MBartTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartTokenizerFast
+    :members:
+
+
 MBartForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/mobilebert.rst
+++ b/docs/source/model_doc/mobilebert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 MobileBERT
 -----------------------------------------------------------------------------------------------------------------------

@@ -58,10 +70,10 @@ MobileBertTokenizerFast
 MobileBert specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_mobilebert.MobileBertForPreTrainingOutput
+.. autoclass:: transformers.models.mobilebert.modeling_mobilebert.MobileBertForPreTrainingOutput
    :members:

-.. autoclass:: transformers.modeling_tf_mobilebert.TFMobileBertForPreTrainingOutput
+.. autoclass:: transformers.models.mobilebert.modeling_tf_mobilebert.TFMobileBertForPreTrainingOutput
    :members:


--- a/docs/source/model_doc/mpnet.rst
+++ b/docs/source/model_doc/mpnet.rst
@@ -0,0 +1,149 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+MPNet
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The MPNet model was proposed in `MPNet: Masked and Permuted Pre-training for Language Understanding
+<https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
+
+MPNet adopts a novel pre-training method, named masked and permuted language modeling, to inherit the advantages of
+masked language modeling and permuted language modeling for natural language understanding.
+
+The abstract from the paper is the following:
+
+*BERT adopts masked language modeling (MLM) for pre-training and is one of the most successful pre-training models.
+Since BERT neglects dependency among predicted tokens, XLNet introduces permuted language modeling (PLM) for
+pre-training to address this problem. However, XLNet does not leverage the full position information of a sentence and
+thus suffers from position discrepancy between pre-training and fine-tuning. In this paper, we propose MPNet, a novel
+pre-training method that inherits the advantages of BERT and XLNet and avoids their limitations. MPNet leverages the
+dependency among predicted tokens through permuted language modeling (vs. MLM in BERT), and takes auxiliary position
+information as input to make the model see a full sentence and thus reducing the position discrepancy (vs. PLM in
+XLNet). We pre-train MPNet on a large-scale dataset (over 160GB text corpora) and fine-tune on a variety of
+down-streaming tasks (GLUE, SQuAD, etc). Experimental results show that MPNet outperforms MLM and PLM by a large
+margin, and achieves better results on these tasks compared with previous state-of-the-art pre-trained methods (e.g.,
+BERT, XLNet, RoBERTa) under the same model setting.*
+
+Tips:
+
+- MPNet doesn't have :obj:`token_type_ids`, so you don't need to indicate which token belongs to which segment. Just
+  separate your segments with the separation token :obj:`tokenizer.sep_token` (or :obj:`[sep]`).
+
+The original code can be found `here <https://github.com/microsoft/MPNet>`__.
+
+MPNetConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetConfig
+    :members:
+
+
+MPNetTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+MPNetTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetTokenizerFast
+    :members:
+
+
+MPNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetModel
+    :members: forward
+
+
+MPNetForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForMaskedLM
+    :members: forward
+
+
+MPNetForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForSequenceClassification
+    :members: forward
+
+
+MPNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForMultipleChoice
+    :members: forward
+
+
+MPNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForTokenClassification
+    :members: forward
+
+
+MPNetForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForQuestionAnswering
+    :members: forward
+
+
+TFMPNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetModel
+    :members: call
+
+
+TFMPNetForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForMaskedLM
+    :members: call
+
+
+TFMPNetForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForSequenceClassification
+    :members: call
+
+
+TFMPNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForMultipleChoice
+    :members: call
+
+
+TFMPNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForTokenClassification
+    :members: call
+
+
+TFMPNetForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/mt5.rst
+++ b/docs/source/model_doc/mt5.rst
@@ -0,0 +1,95 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+MT5
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The mT5 model was presented in `mT5: A massively multilingual pre-trained text-to-text transformer
+<https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya
+Siddhant, Aditya Barua, Colin Raffel.
+
+The abstract from the paper is the following:
+
+*The recent "Text-to-Text Transfer Transformer" (T5) leveraged a unified text-to-text format and scale to attain
+state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a
+multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We describe
+the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual
+benchmarks. All of the code and model checkpoints used in this work are publicly available.*
+
+The original code can be found `here <https://github.com/google-research/multilingual-t5>`__.
+
+MT5Config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5Config
+    :members:
+
+
+MT5Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5Tokenizer
+
+See :class:`~transformers.T5Tokenizer` for all details.
+
+
+MT5TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5TokenizerFast
+
+See :class:`~transformers.T5TokenizerFast` for all details.
+
+
+MT5Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5Model
+    :members:
+
+
+MT5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5ForConditionalGeneration
+    :members:
+
+
+MT5EncoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5EncoderModel
+    :members:
+
+
+TFMT5Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMT5Model
+    :members:
+
+
+TFMT5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMT5ForConditionalGeneration
+    :members:
+
+
+TFMT5EncoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMT5EncoderModel
+    :members:
--- a/docs/source/model_doc/pegasus.rst
+++ b/docs/source/model_doc/pegasus.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Pegasus
 -----------------------------------------------------------------------------------------------------------------------

@@ -31,10 +43,19 @@ All the `checkpoints <https://huggingface.co/models?search=pegasus>`__ are fine-
 - Each checkpoint is 2.2 GB on disk and 568M parameters.
 - FP16 is not supported (help/ideas on this appreciated!).
 - Summarizing xsum in fp32 takes about 400ms/sample, with default parameters on a v100 GPU.
- For XSUM, The paper reports rouge1,rouge2, rougeL of paper: 47.21/24.56/39.25. As of Aug 9, this port scores
-  46.91/24.34/39.1.
+- Full replication results and correctly pre-processed data can be found in this `Issue
+  <https://github.com/huggingface/transformers/issues/6844#issue-689259666>`__.
+- `Distilled checkpoints <https://huggingface.co/models?search=distill-pegasus>`__ are described in this `paper
+  <https://arxiv.org/abs/2010.13002>`__.

-The gap is likely because of different alpha/length_penalty implementations in beam search.
+Examples
+_______________________________________________________________________________________________________________________
+
+- `Script <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/finetune_pegasus_xsum.sh>`__ to
+  fine-tune pegasus on the XSUM dataset. Data download instructions at `examples/seq2seq/
+  <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/README.md>`__.
+- FP16 is not supported (help/ideas on this appreciated!).
+- The adafactor optimizer is recommended for pegasus fine-tuning.


 Implementation Notes
@@ -45,7 +66,7 @@ Implementation Notes
 - Some key configuration differences:

    - static, sinusoidal position embeddings
-    - no :obj:`layernorm_embedding` (:obj`PegasusConfig.normalize_embedding=False`)
+    - no :obj:`layernorm_embedding` (:obj:`PegasusConfig.normalize_embedding=False`)
    - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix.
    - more beams are used (:obj:`num_beams=8`)
 - All pretrained pegasus checkpoints are the same besides three attributes: :obj:`tokenizer.model_max_length` (maximum
@@ -69,7 +90,7 @@ Usage Example
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
-    batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest').to(torch_device)
+    batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
@@ -91,6 +112,13 @@ warning: ``add_tokens`` does not work at the moment.
    :members: __call__, prepare_seq2seq_batch


+PegasusTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.PegasusTokenizerFast
+    :members:
+
+
 PegasusForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/prophetnet.rst
+++ b/docs/source/model_doc/prophetnet.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 ProphetNet
 -----------------------------------------------------------------------------------------------------------------------

@@ -17,7 +29,7 @@ the next token.

 The abstract from the paper is the following:

-*In this paper, we present a new sequence-to-sequence pre-training model called ProphetNet, which introduces a novel
+*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
 self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
 the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
 n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
@@ -25,7 +37,7 @@ step. The future n-gram prediction explicitly encourages the model to plan for t
 overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
 dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
 abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
-state-of-the-art results on all these datasets compared to the models using the same scale pre-training corpus.*
+state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*

 The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.

@@ -47,16 +59,16 @@ ProphetNetTokenizer
 ProphetNet specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_prophetnet.ProphetNetSeq2SeqLMOutput
+.. autoclass:: transformers.models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqLMOutput
    :members:

-.. autoclass:: transformers.modeling_prophetnet.ProphetNetSeq2SeqModelOutput
+.. autoclass:: transformers.models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqModelOutput
    :members:

-.. autoclass:: transformers.modeling_prophetnet.ProphetNetDecoderModelOutput
+.. autoclass:: transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderModelOutput
    :members:

-.. autoclass:: transformers.modeling_prophetnet.ProphetNetDecoderLMOutput
+.. autoclass:: transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderLMOutput
    :members:

 ProphetNetModel
--- a/docs/source/model_doc/rag.rst
+++ b/docs/source/model_doc/rag.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 RAG
 -----------------------------------------------------------------------------------------------------------------------

@@ -50,10 +62,10 @@ RagTokenizer
 Rag specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_rag.RetrievAugLMMarginOutput
+.. autoclass:: transformers.models.rag.modeling_rag.RetrievAugLMMarginOutput
    :members:

-.. autoclass:: transformers.modeling_rag.RetrievAugLMOutput
+.. autoclass:: transformers.models.rag.modeling_rag.RetrievAugLMOutput
    :members:

 RagRetriever
--- a/docs/source/model_doc/reformer.rst
+++ b/docs/source/model_doc/reformer.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Reformer
 -----------------------------------------------------------------------------------------------------------------------

@@ -151,6 +163,13 @@ ReformerTokenizer
    :members: save_vocabulary


+ReformerTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerTokenizerFast
+    :members:
+
+
 ReformerModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/retribert.rst
+++ b/docs/source/model_doc/retribert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 RetriBERT
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 RoBERTa
 -----------------------------------------------------------------------------------------------------------------------

@@ -146,3 +158,10 @@ TFRobertaForQuestionAnswering

 .. autoclass:: transformers.TFRobertaForQuestionAnswering
    :members: call
+
+
+FlaxRobertaModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxRobertaModel
+    :members: __call__
--- a/docs/source/model_doc/squeezebert.rst
+++ b/docs/source/model_doc/squeezebert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 SqueezeBERT
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 T5
 -----------------------------------------------------------------------------------------------------------------------

@@ -17,7 +29,7 @@ The abstract from the paper is the following:
 task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning
 has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of
 transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a
-text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer
+text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer
 approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration
 with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering
 summarization, question answering, text classification, and more. To facilitate future work on transfer learning for
@@ -64,7 +76,7 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash
  input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
  labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
  # the forward function automatically creates the correct decoder_input_ids
-  loss = model(input_ids=input_ids, labels=labels, return_dict=True).loss
+  loss = model(input_ids=input_ids, labels=labels).loss

 - Supervised training

@@ -77,7 +89,7 @@ token. T5 can be trained / fine-tuned both in a supervised and unsupervised fash
  input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
  labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
  # the forward function automatically creates the correct decoder_input_ids
-  loss = model(input_ids=input_ids, labels=labels, return_dict=True).loss
+  loss = model(input_ids=input_ids, labels=labels).loss


 T5Config
@@ -95,19 +107,31 @@ T5Tokenizer
        create_token_type_ids_from_sequences, prepare_seq2seq_batch, save_vocabulary


+T5TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5TokenizerFast
+    :members:
+
+
 T5Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.T5Model
-    :members: forward
+    :members: forward, parallelize, deparallelize


 T5ForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.T5ForConditionalGeneration
-    :members: forward
+    :members: forward, parallelize, deparallelize

+T5EncoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5EncoderModel
+    :members: forward, parallelize, deparallelize

 TFT5Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -121,3 +145,9 @@ TFT5ForConditionalGeneration

 .. autoclass:: transformers.TFT5ForConditionalGeneration
    :members: call
+
+TFT5EncoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFT5EncoderModel
+    :members: call
--- a/docs/source/model_doc/tapas.rst
+++ b/docs/source/model_doc/tapas.rst
@@ -0,0 +1,434 @@
+TAPAS
+-----------------------------------------------------------------------------------------------------------------------
+
+.. note::
+
+    This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs, and fixing
+    them may require slight breaking changes in the future.
+
+
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The TAPAS model was proposed in `TAPAS: Weakly Supervised Table Parsing via Pre-training
+<https://www.aclweb.org/anthology/2020.acl-main.398>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
+Francesco Piccinno and Julian Martin Eisenschlos. It's a BERT-based model specifically designed (and pre-trained) for
+answering questions about tabular data. Compared to BERT, TAPAS uses relative position embeddings and has 7 token types
+that encode tabular structure. TAPAS is pre-trained on the masked language modeling (MLM) objective on a large dataset
+comprising millions of tables from English Wikipedia and corresponding texts. For question answering, TAPAS has 2 heads
+on top: a cell selection head and an aggregation head, for (optionally) performing aggregations (such as counting or
+summing) among selected cells. TAPAS has been fine-tuned on several datasets: `SQA
+<https://www.microsoft.com/en-us/download/details.aspx?id=54253>`__ (Sequential Question Answering by Microsoft), `WTQ
+<https://github.com/ppasupat/WikiTableQuestions>`__ (Wiki Table Questions by Stanford University) and `WikiSQL
+<https://github.com/salesforce/WikiSQL>`__ (by Salesforce). It achieves state-of-the-art on both SQA and WTQ, while
+having comparable performance to SOTA on WikiSQL, with a much simpler architecture.
+
+The abstract from the paper is the following:
+
+*Answering natural language questions over tables is usually seen as a semantic parsing task. To alleviate the
+collection cost of full logical forms, one popular approach focuses on weak supervision consisting of denotations
+instead of logical forms. However, training semantic parsers from weak supervision poses difficulties, and in addition,
+the generated logical forms are only used as an intermediate step prior to retrieving the denotation. In this paper, we
+present TAPAS, an approach to question answering over tables without generating logical forms. TAPAS trains from weak
+supervision, and predicts the denotation by selecting table cells and optionally applying a corresponding aggregation
+operator to such selection. TAPAS extends BERT's architecture to encode tables as input, initializes from an effective
+joint pre-training of text segments and tables crawled from Wikipedia, and is trained end-to-end. We experiment with
+three different semantic parsing datasets, and find that TAPAS outperforms or rivals semantic parsing models by
+improving state-of-the-art accuracy on SQA from 55.1 to 67.2 and performing on par with the state-of-the-art on WIKISQL
+and WIKITQ, but with a simpler model architecture. We additionally find that transfer learning, which is trivial in our
+setting, from WIKISQL to WIKITQ, yields 48.7 accuracy, 4.2 points above the state-of-the-art.*
+
+In addition, the authors have further pre-trained TAPAS to recognize **table entailment**, by creating a balanced
+dataset of millions of automatically created training examples which are learned in an intermediate step prior to
+fine-tuning. The authors of TAPAS call this further pre-training intermediate pre-training (since TAPAS is first
+pre-trained on MLM, and then on another dataset). They found that intermediate pre-training further improves
+performance on SQA, achieving a new state-of-the-art as well as state-of-the-art on `TabFact
+<https://github.com/wenhuchen/Table-Fact-Checking>`__, a large-scale dataset with 16k Wikipedia tables for table
+entailment (a binary classification task). For more details, see their follow-up paper: `Understanding tables with
+intermediate pre-training <https://www.aclweb.org/anthology/2020.findings-emnlp.27/>`__ by Julian Martin Eisenschlos,
+Syrine Krichene and Thomas Müller.
+
+The original code can be found `here <https://github.com/google-research/tapas>`__.
+
+Tips:
+
+- TAPAS is a model that uses relative position embeddings by default (restarting the position embeddings at every cell
+  of the table). Note that this is something that was added after the publication of the original TAPAS paper.
+  According to the authors, this usually results in a slightly better performance, and allows you to encode longer
+  sequences without running out of embeddings. This is reflected in the ``reset_position_index_per_cell`` parameter of
+  :class:`~transformers.TapasConfig`, which is set to ``True`` by default. The default versions of the models available
+  in the `model hub <https://huggingface.co/models?search=tapas>`_ all use relative position embeddings. You can still
+  use the ones with absolute position embeddings by passing in an additional argument ``revision="no_reset"`` when
+  calling the ``.from_pretrained()`` method. Note that it's usually advised to pad the inputs on the right rather than
+  the left.
+- TAPAS is based on BERT, so ``TAPAS-base`` for example corresponds to a ``BERT-base`` architecture. Of course,
+  TAPAS-large will result in the best performance (the results reported in the paper are from TAPAS-large). Results of
+  the various sized models are shown on the `original Github repository <https://github.com/google-research/tapas>`_.
+- TAPAS has checkpoints fine-tuned on SQA, which are capable of answering questions related to a table in a
+  conversational set-up. This means that you can ask follow-up questions such as "what is his age?" related to the
+  previous question. Note that the forward pass of TAPAS is a bit different in case of a conversational set-up: in that
+  case, you have to feed every table-question pair one by one to the model, such that the ``prev_labels`` token type
+  ids can be overwritten by the predicted ``labels`` of the model to the previous question. See the "Usage: inference"
+  section for more info.
+- TAPAS is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore
+  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained
+  with a causal language modeling (CLM) objective are better in that regard.
+
+
+Usage: fine-tuning
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here we explain how you can fine-tune :class:`~transformers.TapasForQuestionAnswering` on your own dataset.
+
+**STEP 1: Choose one of the 3 ways in which you can use TAPAS - or experiment**
+
+Basically, there are 3 different ways in which one can fine-tune :class:`~transformers.TapasForQuestionAnswering`,
+corresponding to the different datasets on which TAPAS was fine-tuned:
+
+1. SQA: if you're interested in asking follow-up questions related to a table, in a conversational set-up. For example
+   if you first ask "what's the name of the first actor?" then you can ask a follow-up question such as "how old is
+   he?". Here, questions do not involve any aggregation (all questions are cell selection questions).
+2. WTQ: if you're not interested in asking questions in a conversational set-up, but rather just asking questions
+   related to a table, which might involve aggregation, such as counting a number of rows, summing up cell values or
+   averaging cell values. You can then for example ask "what's the total number of goals Cristiano Ronaldo made in his
+   career?". This case is also called **weak supervision**, since the model itself must learn the appropriate
+   aggregation operator (SUM/COUNT/AVERAGE/NONE) given only the answer to the question as supervision.
+3. WikiSQL-supervised: this dataset is based on WikiSQL with the model being given the ground truth aggregation
+   operator during training. This is also called **strong supervision**. Here, learning the appropriate aggregation
+   operator is much easier.
+
+To summarize:
+
++------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+| **Task**                           | **Example dataset**  | **Description**                                                                                                   |
++------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+| Conversational                     | SQA                  | Conversational, only cell selection questions                                                                     |
++------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+| Weak supervision for aggregation   | WTQ                  | Questions might involve aggregation, and the model must learn this given only the answer as supervision           |
++------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+| Strong supervision for aggregation | WikiSQL-supervised   | Questions might involve aggregation, and the model must learn this given the gold aggregation operator            |
++------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+
+Initializing a model with a pre-trained base and randomly initialized classification heads from the model hub can be
+done as follows (be sure to have installed the `torch-scatter dependency <https://github.com/rusty1s/pytorch_scatter>`_
+for your environment):
+
+.. code-block::
+
+        >>> from transformers import TapasConfig, TapasForQuestionAnswering
+
+        >>> # for example, the base sized model with default SQA configuration
+        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base')
+
+        >>> # or, the base sized model with WTQ configuration
+        >>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wtq')
+        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+
+        >>> # or, the base sized model with WikiSQL configuration
+        >>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wikisql-supervised')
+        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+
+
+Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. You can also
+experiment by defining any hyperparameters you want when initializing :class:`~transformers.TapasConfig`, and then
+create a :class:`~transformers.TapasForQuestionAnswering` based on that configuration. For example, if you have a
+dataset that has both conversational questions and questions that might involve aggregation, then you can do it this
+way. Here's an example:
+
+.. code-block::
+
+        >>> from transformers import TapasConfig, TapasForQuestionAnswering
+
+        >>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
+        >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False)
+        >>> # initializing the pre-trained base sized model with our custom classification heads
+        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+
+What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned
+checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See `here
+<https://github.com/google-research/tapas/issues/91#issuecomment-735719340>`__ for more info.
+
+For a list of all pre-trained and fine-tuned TAPAS checkpoints available in the HuggingFace model hub, see `here
+<https://huggingface.co/models?search=tapas>`__.
+
+**STEP 2: Prepare your data in the SQA format**
+
+Second, no matter what you picked above, you should prepare your dataset in the `SQA format
+<https://www.microsoft.com/en-us/download/details.aspx?id=54253>`__. This format is a TSV/CSV file with the following
+columns:
+
+- ``id``: optional, id of the table-question pair, for bookkeeping purposes.
+- ``annotator``: optional, id of the person who annotated the table-question pair, for bookkeeping purposes.
+- ``position``: integer indicating if the question is the first, second, third,... related to the table. Only required
+  in case of conversational setup (SQA). You don't need this column in case you're going for WTQ/WikiSQL-supervised.
+- ``question``: string
+- ``table_file``: string, name of a csv file containing the tabular data
+- ``answer_coordinates``: list of one or more tuples (each tuple being a cell coordinate, i.e. row, column pair that is
+  part of the answer)
+- ``answer_text``: list of one or more strings (each string being a cell value that is part of the answer)
+- ``aggregation_label``: index of the aggregation operator. Only required in case of strong supervision for aggregation
+  (the WikiSQL-supervised case)
+- ``float_answer``: the float answer to the question, if there is one (np.nan if there isn't). Only required in case of
+  weak supervision for aggregation (such as WTQ and WikiSQL)
+
+The tables themselves should be present in a folder, each table being a separate csv file. Note that the authors of the
+TAPAS algorithm used conversion scripts with some automated logic to convert the other datasets (WTQ, WikiSQL) into the
+SQA format. The author explains this `here
+<https://github.com/google-research/tapas/issues/50#issuecomment-705465960>`__. Interestingly, these conversion scripts
+are not perfect (the ``answer_coordinates`` and ``float_answer`` fields are populated based on the ``answer_text``),
+meaning that WTQ and WikiSQL results could actually be improved.
+
+**STEP 3: Convert your data into PyTorch tensors using TapasTokenizer**
+
+Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular
+data), you can then use :class:`~transformers.TapasTokenizer` to convert table-question pairs into :obj:`input_ids`,
+:obj:`attention_mask`, :obj:`token_type_ids` and so on. Again, based on which of the three cases you picked above,
+:class:`~transformers.TapasForQuestionAnswering` requires different inputs to be fine-tuned:
+
++------------------------------------+----------------------------------------------------------------------------------------------+
+| **Task**                           | **Required inputs**                                                                          |
++------------------------------------+----------------------------------------------------------------------------------------------+
+| Conversational                     | ``input_ids``, ``attention_mask``, ``token_type_ids``, ``labels``                            |
++------------------------------------+----------------------------------------------------------------------------------------------+
+| Weak supervision for aggregation   | ``input_ids``, ``attention_mask``, ``token_type_ids``, ``labels``, ``numeric_values``,       |
+|                                    | ``numeric_values_scale``, ``float_answer``                                                   |
++------------------------------------+----------------------------------------------------------------------------------------------+
+| Strong supervision for aggregation | ``input_ids``, ``attention_mask``, ``token_type_ids``, ``labels``, ``aggregation_labels``    |
++------------------------------------+----------------------------------------------------------------------------------------------+
+
+:class:`~transformers.TapasTokenizer` creates the ``labels``, ``numeric_values`` and ``numeric_values_scale`` based on
+the ``answer_coordinates`` and ``answer_text`` columns of the TSV file. The ``float_answer`` and ``aggregation_labels``
+are already in the TSV file of step 2. Here's an example:
+
+.. code-block::
+
+        >>> from transformers import TapasTokenizer
+        >>> import pandas as pd
+
+        >>> model_name = 'google/tapas-base'
+        >>> tokenizer = TapasTokenizer.from_pretrained(model_name)
+
+        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
+        >>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+        >>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]]
+        >>> answer_text = [["Brad Pitt"], ["69"], ["209"]]
+        >>> table = pd.DataFrame.from_dict(data)
+        >>> inputs = tokenizer(table=table, queries=queries, answer_coordinates=answer_coordinates, answer_text=answer_text, padding='max_length', return_tensors='pt')
+        >>> inputs
+        {'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]),
+        'numeric_values': tensor([[ ... ]]), 'numeric_values_scale': tensor([[ ... ]]), 'labels': tensor([[ ... ]])}
+
+Note that :class:`~transformers.TapasTokenizer` expects the data of the table to be **text-only**. You can use
+``.astype(str)`` on a dataframe to turn it into text-only data. Of course, this only shows how to encode a single
+training example. It is advised to create a PyTorch dataset and a corresponding dataloader:
+
+.. code-block::
+
+        >>> import torch
+        >>> import pandas as pd
+
+        >>> tsv_path = "your_path_to_the_tsv_file"
+        >>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files"
+
+        >>> class TableDataset(torch.utils.data.Dataset):
+        ...     def __init__(self, data, tokenizer):
+        ...         self.data = data
+        ...         self.tokenizer = tokenizer
+        ...
+        ...     def __getitem__(self, idx):
+        ...         item = self.data.iloc[idx]
+        ...         table = pd.read_csv(table_csv_path + item.table_file).astype(str) # be sure to make your table data text only
+        ...         encoding = self.tokenizer(table=table, 
+        ...                                   queries=item.question, 
+        ...                                   answer_coordinates=item.answer_coordinates, 
+        ...                                   answer_text=item.answer_text,
+        ...                                   truncation=True,
+        ...                                   padding="max_length",
+        ...                                   return_tensors="pt"
+        ...         )
+        ...         # remove the batch dimension which the tokenizer adds by default
+        ...         encoding = {key: val.squeeze(0) for key, val in encoding.items()}
+        ...         # add the float_answer which is also required (weak supervision for aggregation case)
+        ...         encoding["float_answer"] = torch.tensor(item.float_answer) 
+        ...         return encoding
+        ...
+        ...     def __len__(self):
+        ...        return len(self.data)
+
+        >>> data = pd.read_csv(tsv_path, sep='\t')
+        >>> train_dataset = TableDataset(data, tokenizer)
+        >>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
+
+Note that here, we encode each table-question pair independently. This is fine as long as your dataset is **not
+conversational**. In case your dataset involves conversational questions (such as in SQA), then you should first group
+together the ``queries``, ``answer_coordinates`` and ``answer_text`` per table (in the order of their ``position``
+index) and batch encode each table with its questions. This will make sure that the ``prev_labels`` token types (see
+docs of :class:`~transformers.TapasTokenizer`) are set correctly. See `this notebook
+<https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb>`__
+for more info.
+
+**STEP 4: Train (fine-tune) TapasForQuestionAnswering**
+
+You can then fine-tune :class:`~transformers.TapasForQuestionAnswering` using native PyTorch as follows (shown here for
+the weak supervision for aggregation case):
+
+.. code-block::
+
+        >>> from transformers import TapasConfig, TapasForQuestionAnswering, AdamW
+
+        >>> # this is the default WTQ configuration
+        >>> config = TapasConfig(
+        ...            num_aggregation_labels = 4,
+        ...            use_answer_as_supervision = True,
+        ...            answer_loss_cutoff = 0.664694,
+        ...            cell_selection_preference = 0.207951,
+        ...            huber_loss_delta = 0.121194,
+        ...            init_cell_selection_weights_to_zero = True,
+        ...            select_one_column = True,
+        ...            allow_empty_column_selection = False,
+        ...            temperature = 0.0352513,
+        ... )
+        >>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+
+        >>> optimizer = AdamW(model.parameters(), lr=5e-5)
+
+        >>> for epoch in range(2):  # loop over the dataset multiple times
+        ...    for idx, batch in enumerate(train_dataloader):
+        ...         # get the inputs; 
+        ...         input_ids = batch["input_ids"]
+        ...         attention_mask = batch["attention_mask"]
+        ...         token_type_ids = batch["token_type_ids"]
+        ...         labels = batch["labels"]
+        ...         numeric_values = batch["numeric_values"]
+        ...         numeric_values_scale = batch["numeric_values_scale"]
+        ...         float_answer = batch["float_answer"]
+
+        ...         # zero the parameter gradients
+        ...         optimizer.zero_grad()
+
+        ...         # forward + backward + optimize
+        ...         outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, 
+        ...                        labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, 
+        ...                        float_answer=float_answer)
+        ...         loss = outputs.loss
+        ...         loss.backward()
+        ...         optimizer.step()
+
+Usage: inference
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here we explain how you can use :class:`~transformers.TapasForQuestionAnswering` for inference (i.e. making predictions
+on new data). For inference, only ``input_ids``, ``attention_mask`` and ``token_type_ids`` (which you can obtain using
+:class:`~transformers.TapasTokenizer`) have to be provided to the model to obtain the logits. Next, you can use the
+handy ``convert_logits_to_predictions`` method of :class:`~transformers.TapasTokenizer` to convert these into predicted
+coordinates and optional aggregation indices.
+
+However, note that inference is **different** depending on whether or not the setup is conversational. In a
+non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. Here's an example
+of that:
+
+.. code-block::
+
+        >>> from transformers import TapasTokenizer, TapasForQuestionAnswering
+        >>> import pandas as pd 
+
+        >>> model_name = 'google/tapas-base-finetuned-wtq'
+        >>> model = TapasForQuestionAnswering.from_pretrained(model_name)
+        >>> tokenizer = TapasTokenizer.from_pretrained(model_name)
+
+        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
+        >>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+        >>> table = pd.DataFrame.from_dict(data)
+        >>> inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="pt") 
+        >>> outputs = model(**inputs)
+        >>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
+        ...         inputs, 
+        ...         outputs.logits.detach(), 
+        ...         outputs.logits_aggregation.detach()
+        ... )
+
+        >>> # let's print out the results:
+        >>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"}
+        >>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]
+
+        >>> answers = []
+        >>> for coordinates in predicted_answer_coordinates:
+        ...   if len(coordinates) == 1:
+        ...     # only a single cell:
+        ...     answers.append(table.iat[coordinates[0]])
+        ...   else:
+        ...     # multiple cells
+        ...     cell_values = []
+        ...     for coordinate in coordinates:
+        ...        cell_values.append(table.iat[coordinate])
+        ...     answers.append(", ".join(cell_values))
+
+        >>> display(table)
+        >>> print("")
+        >>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
+        ...   print(query)
+        ...   if predicted_agg == "NONE":
+        ...     print("Predicted answer: " + answer)
+        ...   else:
+        ...     print("Predicted answer: " + predicted_agg + " > " + answer)    
+        What is the name of the first actor?
+        Predicted answer: Brad Pitt
+        How many movies has George Clooney played in?
+        Predicted answer: COUNT > 69
+        What is the total number of movies?
+        Predicted answer: SUM > 87, 53, 69
+
+In case of a conversational set-up, each table-question pair must be provided **sequentially** to the model, such
+that the ``prev_labels`` token types can be overwritten by the predicted ``labels`` of the previous table-question
+pair. Again, more info can be found in `this notebook
+<https://github.com/NielsRogge/Transformers-Tutorials/blob/master/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb>`__.
+
+
+Tapas specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.tapas.modeling_tapas.TableQuestionAnsweringOutput
+    :members:
+
+
+TapasConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasConfig
+    :members:
+
+
+TapasTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasTokenizer
+    :members: __call__, convert_logits_to_predictions, save_vocabulary
+
+
+TapasModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasModel
+    :members: forward
+
+
+TapasForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasForMaskedLM
+    :members: forward
+
+
+TapasForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasForSequenceClassification
+    :members: forward
+
+
+TapasForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasForQuestionAnswering
+    :members: forward
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Transformer XL
 -----------------------------------------------------------------------------------------------------------------------

@@ -49,16 +61,16 @@ TransfoXLTokenizer
 TransfoXL specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_transfo_xl.TransfoXLModelOutput
+.. autoclass:: transformers.models.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput
    :members:

-.. autoclass:: transformers.modeling_transfo_xl.TransfoXLLMHeadModelOutput
+.. autoclass:: transformers.models.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_transfo_xl.TFTransfoXLModelOutput
+.. autoclass:: transformers.models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput
+.. autoclass:: transformers.models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput
    :members:


@@ -75,6 +87,11 @@ TransfoXLLMHeadModel
 .. autoclass:: transformers.TransfoXLLMHeadModel
    :members: forward

+TransfoXLForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TransfoXLForSequenceClassification
+    :members: forward

 TFTransfoXLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -88,3 +105,11 @@ TFTransfoXLLMHeadModel

 .. autoclass:: transformers.TFTransfoXLLMHeadModel
    :members: call
+
+
+Internal Layers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AdaptiveEmbedding
+
+.. autoclass:: transformers.TFAdaptiveEmbedding
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 XLM
 -----------------------------------------------------------------------------------------------------------------------

@@ -50,7 +62,7 @@ XLMTokenizer
 XLM specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_xlm.XLMForQuestionAnsweringOutput
+.. autoclass:: transformers.models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput
    :members:


--- a/docs/source/model_doc/xlmprophetnet.rst
+++ b/docs/source/model_doc/xlmprophetnet.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 XLM-ProphetNet
 -----------------------------------------------------------------------------------------------------------------------

@@ -19,7 +31,7 @@ just the next token. Its architecture is identical to ProhpetNet, but the model

 The abstract from the paper is the following:

-*In this paper, we present a new sequence-to-sequence pre-training model called ProphetNet, which introduces a novel
+*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
 self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
 the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
 n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
@@ -27,7 +39,7 @@ step. The future n-gram prediction explicitly encourages the model to plan for t
 overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
 dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
 abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
-state-of-the-art results on all these datasets compared to the models using the same scale pre-training corpus.*
+state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*

 The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.

--- a/docs/source/model_doc/xlmroberta.rst
+++ b/docs/source/model_doc/xlmroberta.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 XLM-RoBERTa
 -----------------------------------------------------------------------------------------------------------------------

@@ -50,6 +62,13 @@ XLMRobertaTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+XLMRobertaTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaTokenizerFast
+    :members:
+
+
 XLMRobertaModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 XLNet
 -----------------------------------------------------------------------------------------------------------------------

@@ -50,46 +62,53 @@ XLNetTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+XLNetTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLNetTokenizerFast
+    :members:
+
+
 XLNet specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_xlnet.XLNetModelOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetModelOutput
    :members:

-.. autoclass:: transformers.modeling_xlnet.XLNetLMHeadModelOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetLMHeadModelOutput
    :members:

-.. autoclass:: transformers.modeling_xlnet.XLNetForSequenceClassificationOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForSequenceClassificationOutput
    :members:

-.. autoclass:: transformers.modeling_xlnet.XLNetForMultipleChoiceOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForMultipleChoiceOutput
    :members:

-.. autoclass:: transformers.modeling_xlnet.XLNetForTokenClassificationOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForTokenClassificationOutput
    :members:

-.. autoclass:: transformers.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput
    :members:

-.. autoclass:: transformers.modeling_xlnet.XLNetForQuestionAnsweringOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringOutput
    :members:

-.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetModelOutput
+.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetLMHeadModelOutput
+.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetLMHeadModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput
+.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput
    :members:

-.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput
+.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput
    :members:

-.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput
+.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput
    :members:

-.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput
+.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput
    :members:


--- a/docs/source/model_sharing.rst
+++ b/docs/source/model_sharing.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Model sharing and uploading
 =======================================================================================================================

@@ -37,7 +49,7 @@ For instance:

 .. code-block::

-    >>> tokenizer = AutoTokenizer.from_pretrained(
+    >>> model = AutoModel.from_pretrained(
    >>>   "julien-c/EsperBERTo-small",
    >>>   revision="v2.0.1" # tag name, or branch name, or commit hash
    >>> )
@@ -46,37 +58,52 @@ Basic steps
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 In order to upload a model, you'll need to first create a git repo. This repo will live on the model hub, allowing
-users to clone it and you (and your organization members) to push to it. First, you should ensure you are logged in the
-``transformers-cli``:
+users to clone it and you (and your organization members) to push to it.

-Go in a terminal and run the following command. It should be in the virtual environment where you installed 🤗
+You can create a model repo directly from `the /new page on the website <https://huggingface.co/new>`__.
+
+Alternatively, you can use the ``transformers-cli``. The next steps describe that process:
+
+Go to a terminal and run the following command. It should be in the virtual environment where you installed 🤗
 Transformers, since that command :obj:`transformers-cli` comes from the library.

-.. code-block::
+.. code-block:: bash

    transformers-cli login


 Once you are logged in with your model hub credentials, you can start building your repositories. To create a repo:

-.. code-block::
+.. code-block:: bash

    transformers-cli repo create your-model-name

-This creates a repo on the model hub, which can be cloned. You can then add/remove from that repo as you would with any
-other git repo.
+This creates a repo on the model hub, which can be cloned.

-.. code-block::
+.. code-block:: bash
+
+    # Make sure you have git-lfs installed
+    # (https://git-lfs.github.com/)
+    git lfs install

    git clone https://huggingface.co/username/your-model-name

-    # Then commit as usual
+When you have your local clone of your repo and lfs installed, you can then add/remove from that clone as you would
+with any other git repo.
+
+.. code-block:: bash
+
+    # Commit as usual
    cd your-model-name
    echo "hello" >> README.md
    git add . && git commit -m "Update from $USER"

-We are intentionally not wrapping git too much, so as to stay intuitive and easy-to-use.
+We are intentionally not wrapping git too much, so that you can go on with the workflow you're used to and the tools
+you already know.

+The only learning curve you might have compared to regular git is the one for git-lfs. The documentation at
+`git-lfs.github.com <https://git-lfs.github.com/>`__ is decent, but we'll work on a tutorial with some tips and tricks
+in the coming weeks!

 Make your model work on all frameworks
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -87,7 +114,7 @@ Make your model work on all frameworks
 You probably have your favorite framework, but so will other users! That's why it's best to upload your model with both
 PyTorch `and` TensorFlow checkpoints to make it easier to use (if you skip this step, users will still be able to load
 your model in another framework, but it will be slower, as it will have to be converted on the fly). Don't worry, it's
-super easy to do (and in a future version, it will all be automatic). You will need to install both PyTorch and
+super easy to do (and in a future version, it might all be automatic). You will need to install both PyTorch and
 TensorFlow for this step, but you don't need to worry about the GPU, so it should be very easy. Check the `TensorFlow
 installation page <https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available>`__ and/or the `PyTorch
 installation page <https://pytorch.org/get-started/locally/#start-locally>`__ to see how.
@@ -159,24 +186,25 @@ Or, if you're using the Trainer API
 .. code-block::

    >>> trainer.save_model("path/to/awesome-name-you-picked")
+    >>> tokenizer.save_pretrained("path/to/repo/clone/your-model-name")

 You can then add these files to the staging environment and verify that they have been correctly staged with the ``git
 status`` command:

-.. code-block::
+.. code-block:: bash

    git add --all
    git status

-Finally, the files should be comitted:
+Finally, the files should be committed:

-.. code-block::
+.. code-block:: bash

    git commit -m "First version of the your-model-name model and tokenizer."

 And pushed to the remote:

-.. code-block::
+.. code-block:: bash

    git push

@@ -186,23 +214,20 @@ This will upload the folder containing the weights, tokenizer and configuration
 Add a model card
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-To make sure everyone knows what your model can do, what its limitations and potential bias or ethetical
-considerations, please add a README.md model card to the 🤗 Transformers repo under `model_cards/`. It should then be
-placed in a subfolder with your username or organization, then another subfolder named like your model
-(`awesome-name-you-picked`). Or just click on the "Create a model card on GitHub" button on the model page, it will get
-you directly to the right location. If you need one, `here <https://github.com/huggingface/model_card>`__ is a model
-card template (meta-suggestions are welcome).
+To make sure everyone knows what your model can do, what its limitations, potential bias or ethical considerations are,
+please add a README.md model card to your model repo. You can just create it, or there's also a convenient button
+titled "Add a README.md" on your model page.
+A model card template can be found `here <https://github.com/huggingface/model_card>`__ (meta-suggestions are
+welcome).
+
+.. note::
+
+    Model cards used to live in the 🤗 Transformers repo under `model_cards/`, but for consistency and scalability we
+    migrated every model card from the repo to its corresponding huggingface.co model repo.

 If your model is fine-tuned from another model coming from the model hub (all 🤗 Transformers pretrained models do),
 don't forget to link to its model card so that people can fully trace how your model was built.

-If you have never made a pull request to the 🤗 Transformers repo, look at the :doc:`contributing guide <contributing>`
-to see the steps to follow.
-
-.. Note::
-
-    You can also send your model card in the folder you uploaded with the CLI by placing it in a `README.md` file
-    inside `path/to/awesome-name-you-picked/`.

 Using your model
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -225,3 +250,49 @@ You may specify a revision by using the ``revision`` flag in the ``from_pretrain
    >>>   "julien-c/EsperBERTo-small",
    >>>   revision="v2.0.1" # tag name, or branch name, or commit hash
    >>> )
+
+Workflow in a Colab notebook
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you're in a Colab notebook (or similar) with no direct access to a terminal, here is the workflow you can use to
+upload your model. You can execute each of the commands below in a cell by adding a ! at the beginning.
+
+First you need to install `git-lfs` in the environment used by the notebook:
+
+.. code-block:: bash
+
+    sudo apt-get install git-lfs
+
+Then you can either create a repo directly from `huggingface.co <https://huggingface.co/>`__ , or use the
+:obj:`transformers-cli` to create it:
+
+
+.. code-block:: bash
+
+    transformers-cli login
+    transformers-cli repo create your-model-name
+
+Once it's created, you can clone it and configure it (replace username by your username on huggingface.co):
+
+.. code-block:: bash
+
+    git lfs install
+
+    git clone https://username:password@huggingface.co/username/your-model-name
+    # Alternatively if you have a token,
+    # you can use it instead of your password
+    git clone https://username:token@huggingface.co/username/your-model-name
+
+    cd your-model-name
+    git config --global user.email "email@example.com"
+    # Tip: using the same email as for your huggingface.co account will link your commits to your profile
+    git config --global user.name "Your name"
+
+Once you've saved your model inside, and your clone is set up with the right remote URL, you can add it and push it
+with usual git commands.
+
+.. code-block:: bash
+
+    git add .
+    git commit -m "Initial commit"
+    git push
--- a/docs/source/model_summary.rst
+++ b/docs/source/model_summary.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Summary of the models
 =======================================================================================================================

@@ -527,10 +539,10 @@ Pegasus
 <https://arxiv.org/pdf/1912.08777.pdf>`_, Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.

 Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on
-two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pre-training
+two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pretraining
 objective, called Gap Sentence Generation (GSG).

-  * MLM: encoder input tokens are randomely replaced by a mask tokens and have to be predicted by the encoder (like in
+  * MLM: encoder input tokens are randomly replaced by mask tokens and have to be predicted by the encoder (like in
    BERT)
  * GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, but which has a
    causal mask to hide the future words like a regular auto-regressive transformer decoder.
@@ -560,6 +572,7 @@ A framework for translation models, using the same models as BART

 The library provides a version of this model for conditional generation.

+
 T5
 -----------------------------------------------------------------------------------------------------------------------

@@ -592,6 +605,28 @@ For instance, if we have the sentence “My dog is very cute .”, and we decide

 The library provides a version of this model for conditional generation.

+
+MT5
+-----------------------------------------------------------------------------------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=mt5">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-mt5-blueviolet">
+   </a>
+   <a href="model_doc/mt5.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-mt5-blueviolet">
+   </a>
+
+`mT5: A massively multilingual pre-trained text-to-text transformer <https://arxiv.org/abs/2010.11934>`_, Linting Xue
+et al.
+
+The model architecture is the same as T5. mT5's pretraining objective includes T5's self-supervised training, but not
+T5's supervised training. mT5 is trained on 101 languages.
+
+The library provides a version of this model for conditional generation.
+
+
 MBart
 -----------------------------------------------------------------------------------------------------------------------

@@ -607,8 +642,8 @@ MBart
 `Multilingual Denoising Pre-training for Neural Machine Translation <https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu,
 Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.

-The model architecture and pre-training objective is same as BART, but MBart is trained on 25 languages and is intended
-for supervised and unsupervised machine translation. MBart is one of the first methods for pre-training a complete
+The model architecture and pretraining objective is same as BART, but MBart is trained on 25 languages and is intended
+for supervised and unsupervised machine translation. MBart is one of the first methods for pretraining a complete
 sequence-to-sequence model by denoising full texts in multiple languages,

 The library provides a version of this model for conditional generation.
@@ -635,7 +670,7 @@ ProphetNet
 `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by
 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.

-ProphetNet introduces a novel *sequence-to-sequence* pre-training objective, called *future n-gram prediction*. In
+ProphetNet introduces a novel *sequence-to-sequence* pretraining objective, called *future n-gram prediction*. In
 future n-gram prediction, the model predicts the next n tokens simultaneously based on previous context tokens at each
 time step instead instead of just the single next token. The future n-gram prediction explicitly encourages the model
 to plan for the future tokens and prevent overfitting on strong local correlations. The model architecture is based on
@@ -660,8 +695,8 @@ XLM-ProphetNet
 `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by
 Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.

-XLM-ProphetNet's model architecture and pre-training objective is same as ProphetNet, but XLM-ProphetNet was
-pre-trained on the cross-lingual dataset `XGLUE <https://arxiv.org/abs/2004.01401>`__.
+XLM-ProphetNet's model architecture and pretraining objective are the same as ProphetNet, but XLM-ProphetNet was
+pre-trained on the cross-lingual dataset `XGLUE <https://arxiv.org/abs/2004.01401>`__.

 The library provides a pre-trained version of this model for multi-lingual conditional generation and fine-tuned
 versions for headline generation and question generation, respectively.
--- a/docs/source/multilingual.rst
+++ b/docs/source/multilingual.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Multi-lingual models
 =======================================================================================================================

@@ -109,7 +121,7 @@ XLM-RoBERTa
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 XLM-RoBERTa was trained on 2.5TB of newly created clean CommonCrawl data in 100 languages. It provides strong gains
-over previously released multi-lingual models like mBERT or XLM on downstream taks like classification, sequence
+over previously released multi-lingual models like mBERT or XLM on downstream tasks like classification, sequence
 labeling and question answering.

 Two XLM-RoBERTa checkpoints can be used for multi-lingual tasks:
--- a/docs/source/perplexity.rst
+++ b/docs/source/perplexity.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Perplexity of fixed-length models
 =======================================================================================================================

@@ -62,7 +74,7 @@ sliding the context window so that the model has more context when making each p
 This is a closer approximation to the true decomposition of the sequence probability and will typically yield a more
 favorable score. The downside is that it requires a separate forward pass for each token in the corpus. A good
 practical compromise is to employ a strided sliding window, moving the context by larger strides rather than sliding by
-1 token a time. This allows computation to procede much faster while still giving the model a large context to make
+1 token a time. This allows computation to proceed much faster while still giving the model a large context to make
 predictions at each step.

 Example: Calculating perplexity with GPT-2 in 🤗 Transformers
--- a/docs/source/philosophy.rst
+++ b/docs/source/philosophy.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Philosophy
 =======================================================================================================================

--- a/docs/source/preprocessing.rst
+++ b/docs/source/preprocessing.rst
@@ -1,8 +1,19 @@
-Preprocessing data
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Preprocessing data
 =======================================================================================================================

 In this tutorial, we'll explore how to preprocess your data using 🤗 Transformers. The main tool for this is what we
-
 call a :doc:`tokenizer <main_classes/tokenizer>`. You can build one using the tokenizer class associated to the model
 you would like to use, or directly with the :class:`~transformers.AutoTokenizer` class.

@@ -52,7 +63,7 @@ The tokenizer can decode a list of token ids in a proper sentence:
    "[CLS] Hello, I'm a single sentence! [SEP]"

 As you can see, the tokenizer automatically added some special tokens that the model expects. Not all models need
-special tokens; for instance, if we had used` gtp2-medium` instead of `bert-base-cased` to create our tokenizer, we
+special tokens; for instance, if we had used `gpt2-medium` instead of `bert-base-cased` to create our tokenizer, we
 would have seen the same sentence as the original one here. You can disable this behavior (which is only advised if you
 have added those special tokens yourself) by passing ``add_special_tokens=False``.

--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -1,13 +1,25 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Pretrained models
 =======================================================================================================================

 Here is the full list of the currently provided pretrained models together with a short presentation of each model.

-For a list that includes community-uploaded models, refer to `https://huggingface.co/models
+For a list that includes all community-uploaded models, refer to `https://huggingface.co/models
 <https://huggingface.co/models>`__.

 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Architecture       | Shortcut name                                              | Details of the model                                                                                                                  |
+| Architecture       | Model id                                                   | Details of the model                                                                                                                  |
 +====================+============================================================+=======================================================================================================================================+
 | BERT               | ``bert-base-uncased``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                    |                                                            | | Trained on lower-cased English text.                                                                                                |
@@ -333,6 +345,12 @@ For a list that includes community-uploaded models, refer to `https://huggingfac
 |                    | ``facebook/bart-large-cnn``                                | | 24-layer, 1024-hidden, 16-heads, 406M parameters       (same as large)                                                              |
 |                    |                                                            | | bart-large base architecture finetuned on cnn summarization task                                                                    |
 +--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| BARThez            | ``moussaKam/barthez``                                      | | 12-layer,  768-hidden, 12-heads, 216M parameters                                                                                    |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/moussaKam/BARThez>`__)                                                                              |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``moussaKam/mbarthez``                                     | | 24-layer, 1024-hidden, 16-heads, 561M parameters                                                                                    |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | DialoGPT           | ``DialoGPT-small``                                         | | 12-layer, 768-hidden, 12-heads, 124M parameters                                                                                     |
 |                    |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
 |                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Quick tour
 =======================================================================================================================

@@ -182,6 +194,7 @@ and get tensors back. You can specify all of that to the tokenizer:
    ...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    ...     padding=True,
    ...     truncation=True,
+    ...     max_length=512,
    ...     return_tensors="pt"
    ... )
    >>> ## TENSORFLOW CODE
@@ -189,6 +202,7 @@ and get tensors back. You can specify all of that to the tokenizer:
    ...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
    ...     padding=True,
    ...     truncation=True,
+    ...     max_length=512,
    ...     return_tensors="tf"
    ... )

@@ -240,7 +254,9 @@ activations of the model.
           [ 0.08181786, -0.04179301]], dtype=float32)>,)

 The model can return more than just the final activations, which is why the output is a tuple. Here we only asked for
-the final activations, so we get a tuple with one element. .. note::
+the final activations, so we get a tuple with one element.
+
+.. note::

    All 🤗 Transformers models (PyTorch or TensorFlow) return the activations of the model *before* the final activation
    function (like SoftMax) since this final activation function is often fused with the loss.
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -1,4 +1,15 @@
-***********************************************************************************************************************
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Exporting transformers models
 ***********************************************************************************************************************

@@ -70,8 +81,8 @@ inference.
    optimizations afterwards.

 .. note::
-    For more information about the optimizations enabled by ONNXRuntime, please have a look at the (`ONNXRuntime Github
-    <https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers>`_)
+    For more information about the optimizations enabled by ONNXRuntime, please have a look at the `ONNXRuntime Github
+    <https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers>`_.

 Quantization
 -----------------------------------------------------------------------------------------------------------------------
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Summary of the tasks
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -89,7 +101,7 @@ each other. The process is the following:
    >>> import torch

    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
-    >>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=True)
+    >>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

    >>> classes = ["not paraphrase", "is paraphrase"]

@@ -122,7 +134,7 @@ each other. The process is the following:
    >>> import tensorflow as tf

    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
-    >>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc", return_dict=True)
+    >>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

    >>> classes = ["not paraphrase", "is paraphrase"]

@@ -211,7 +223,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
    >>> import torch

    >>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-    >>> model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad", return_dict=True)
+    >>> model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

    >>> text = r"""
    ... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
@@ -231,7 +243,9 @@ Here is an example of question answering using a model and a tokenizer. The proc
    ...     input_ids = inputs["input_ids"].tolist()[0]
    ...
    ...     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-    ...     answer_start_scores, answer_end_scores = model(**inputs)
+    ...     outputs = model(**inputs)
+    ...     answer_start_scores = outputs.start_logits
+    ...     answer_end_scores = outputs.end_logits
    ...
    ...     answer_start = torch.argmax(
    ...         answer_start_scores
@@ -253,7 +267,7 @@ Here is an example of question answering using a model and a tokenizer. The proc
    >>> import tensorflow as tf

    >>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-    >>> model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad", return_dict=True)
+    >>> model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

    >>> text = r"""
    ... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
@@ -273,7 +287,9 @@ Here is an example of question answering using a model and a tokenizer. The proc
    ...     input_ids = inputs["input_ids"].numpy()[0]
    ...
    ...     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-    ...     answer_start_scores, answer_end_scores = model(inputs)
+    ...     outputs = model(inputs)
+    ...     answer_start_scores = outputs.start_logits
+    ...     answer_end_scores = outputs.end_logits
    ...
    ...     answer_start = tf.argmax(
    ...         answer_start_scores, axis=1
@@ -301,7 +317,7 @@ Language modeling is the task of fitting a model to a corpus, which can be domai
 transformer-based models are trained using a variant of language modeling, e.g. BERT with masked language modeling,
 GPT-2 with causal language modeling.

-Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
+Language modeling can be useful outside of pretraining as well, for example to shift the model distribution to be
 domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset or
 on scientific papers e.g. `LysandreJik/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.

@@ -373,7 +389,7 @@ Here is an example of doing masked language modeling using a model and a tokeniz
    >>> import torch

    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    >>> model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased", return_dict=True)
+    >>> model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")

    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."

@@ -389,7 +405,7 @@ Here is an example of doing masked language modeling using a model and a tokeniz
    >>> import tensorflow as tf

    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    >>> model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased", return_dict=True)
+    >>> model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")

    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."

@@ -437,7 +453,7 @@ of tokens.
    >>> from torch.nn import functional as F

    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    >>> model = AutoModelWithLMHead.from_pretrained("gpt2", return_dict=True)
+    >>> model = AutoModelWithLMHead.from_pretrained("gpt2")

    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "

@@ -461,7 +477,7 @@ of tokens.
    >>> import tensorflow as tf

    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    >>> model = TFAutoModelWithLMHead.from_pretrained("gpt2", return_dict=True)
+    >>> model = TFAutoModelWithLMHead.from_pretrained("gpt2")

    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "

@@ -513,14 +529,14 @@ Here, the model generates a random text with a total maximal length of *50* toke
 concerned, I will"*. The default arguments of ``PreTrainedModel.generate()`` can be directly overridden in the
 pipeline, as is shown above for the argument ``max_length``.

-Here is an example of text generation using ``XLNet`` and its tokenzier.
+Here is an example of text generation using ``XLNet`` and its tokenizer.

 .. code-block::

    >>> ## PYTORCH CODE
    >>> from transformers import AutoModelWithLMHead, AutoTokenizer

-    >>> model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased", return_dict=True)
+    >>> model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")
    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
@@ -545,7 +561,7 @@ Here is an example of text generation using ``XLNet`` and its tokenzier.
    >>> ## TENSORFLOW CODE
    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer

-    >>> model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased", return_dict=True)
+    >>> model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")

    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
@@ -664,7 +680,7 @@ Here is an example of doing named entity recognition, using a model and a tokeni
    >>> from transformers import AutoModelForTokenClassification, AutoTokenizer
    >>> import torch

-    >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english", return_dict=True)
+    >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

    >>> label_list = [
@@ -692,7 +708,7 @@ Here is an example of doing named entity recognition, using a model and a tokeni
    >>> from transformers import TFAutoModelForTokenClassification, AutoTokenizer
    >>> import tensorflow as tf

-    >>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english", return_dict=True)
+    >>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

    >>> label_list = [
@@ -790,7 +806,7 @@ CNN / Daily Mail), it yields very good results.
    >>> ## PYTORCH CODE
    >>> from transformers import AutoModelWithLMHead, AutoTokenizer

-    >>> model = AutoModelWithLMHead.from_pretrained("t5-base", return_dict=True)
+    >>> model = AutoModelWithLMHead.from_pretrained("t5-base")
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
@@ -799,7 +815,7 @@ CNN / Daily Mail), it yields very good results.
    >>> ## TENSORFLOW CODE
    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer

-    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base", return_dict=True)
+    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base")
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
@@ -834,7 +850,7 @@ Here is an example of doing translation using a model and a tokenizer. The proce

 1. Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder
   model, such as ``Bart`` or ``T5``.
-2. Define the article that should be summarizaed.
+2. Define the text that should be translated.
 3. Add the T5 specific prefix "translate English to German: "
 4. Use the ``PreTrainedModel.generate()`` method to perform the translation.

@@ -843,7 +859,7 @@ Here is an example of doing translation using a model and a tokenizer. The proce
    >>> ## PYTORCH CODE
    >>> from transformers import AutoModelWithLMHead, AutoTokenizer

-    >>> model = AutoModelWithLMHead.from_pretrained("t5-base", return_dict=True)
+    >>> model = AutoModelWithLMHead.from_pretrained("t5-base")
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt")
@@ -851,7 +867,7 @@ Here is an example of doing translation using a model and a tokenizer. The proce
    >>> ## TENSORFLOW CODE
    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer

-    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base", return_dict=True)
+    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base")
    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")

    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf")
--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Testing
 =======================================================================================================================

@@ -405,32 +417,32 @@ decorators are used to set the requirements of tests CPU/GPU/TPU-wise:

 * ``require_torch`` - this test will run only under torch
 * ``require_torch_gpu`` - as ``require_torch`` plus requires at least 1 GPU
-* ``require_torch_multigpu`` - as ``require_torch`` plus requires at least 2 GPUs
-* ``require_torch_non_multigpu`` - as ``require_torch`` plus requires 0 or 1 GPUs
+* ``require_torch_multi_gpu`` - as ``require_torch`` plus requires at least 2 GPUs
+* ``require_torch_non_multi_gpu`` - as ``require_torch`` plus requires 0 or 1 GPUs
 * ``require_torch_tpu`` - as ``require_torch`` plus requires at least 1 TPU

 Let's depict the GPU requirements in the following table:


-+----------+---------------------------------+
-| n gpus   |  decorator                      |
-+==========+=================================+
-| ``>= 0`` | ``@require_torch``              |
-+----------+---------------------------------+
-| ``>= 1`` | ``@require_torch_gpu``          |
-+----------+---------------------------------+
-| ``>= 2`` | ``@require_torch_multigpu``     |
-+----------+---------------------------------+
-| ``< 2``  | ``@require_torch_non_multigpu`` |
-+----------+---------------------------------+
++----------+----------------------------------+
+| n gpus   |  decorator                       |
++==========+==================================+
+| ``>= 0`` | ``@require_torch``               |
++----------+----------------------------------+
+| ``>= 1`` | ``@require_torch_gpu``           |
++----------+----------------------------------+
+| ``>= 2`` | ``@require_torch_multi_gpu``     |
++----------+----------------------------------+
+| ``< 2``  | ``@require_torch_non_multi_gpu`` |
++----------+----------------------------------+


 For example, here is a test that must be run only when there are 2 or more GPUs available and pytorch is installed:

 .. code-block:: python

-    @require_torch_multigpu
-    def test_example_with_multigpu():
+    @require_torch_multi_gpu
+    def test_example_with_multi_gpu():

 If a test requires ``tensorflow`` use the ``require_tf`` decorator. For example:

@@ -454,7 +466,7 @@ last for them to work correctly. Here is an example of the correct usage:
 .. code-block:: python

    @parameterized.expand(...)
-    @require_torch_multigpu
+    @require_torch_multi_gpu
    def test_integration_foo():

 This order problem doesn't exist with ``@pytest.mark.parametrize``, you can put it first or last and it will still
@@ -716,11 +728,11 @@ Temporary files and directories
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Using unique temporary files and directories are essential for parallel test running, so that the tests won't overwrite
-each other's data. Also we want to get the temp files and directories removed at the end of each test that created
+each other's data. Also we want to get the temporary files and directories removed at the end of each test that created
 them. Therefore, using packages like ``tempfile``, which address these needs is essential.

-However, when debugging tests, you need to be able to see what goes into the temp file or directory and you want to
-know it's exact path and not having it randomized on every test re-run.
+However, when debugging tests, you need to be able to see what goes into the temporary file or directory and you want
+to know its exact path and not have it randomized on every test re-run.

 A helper class :obj:`transformers.test_utils.TestCasePlus` is best used for such purposes. It's a sub-class of
 :obj:`unittest.TestCase`, so we can easily inherit from it in the test modules.
@@ -736,32 +748,33 @@ Here is an example of its usage:

 This code creates a unique temporary directory, and sets :obj:`tmp_dir` to its location.

-In this and all the following scenarios the temporary directory will be auto-removed at the end of test, unless
-``after=False`` is passed to the helper function.
-
-* Create a temporary directory of my choice and delete it at the end - useful for debugging when you want to monitor a
-  specific directory:
+* Create a unique temporary dir:

 .. code-block:: python

    def test_whatever(self):
-        tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test")
+        tmp_dir = self.get_auto_remove_tmp_dir()

-* Create a temporary directory of my choice and do not delete it at the end---useful for when you want to look at the
-  temp results:
+``tmp_dir`` will contain the path to the created temporary dir. It will be automatically removed at the end of the
+test.
+
+* Create a temporary dir of my choice, ensure it's empty before the test starts and don't empty it after the test.

 .. code-block:: python

    def test_whatever(self):
-        tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", after=False)
+        tmp_dir = self.get_auto_remove_tmp_dir("./xxx")

-* Create a temporary directory of my choice and ensure to delete it right away---useful for when you disabled deletion
-  in the previous test run and want to make sure the that temporary directory is empty before the new test is run:
+This is useful for debugging when you want to monitor a specific directory and want to make sure the previous tests
+didn't leave any data in there.

-.. code-block:: python
+* You can override the default behavior by directly overriding the ``before`` and ``after`` args, leading to one of the
+  following behaviors:

-   def test_whatever(self):
-        tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", before=True)
+    - ``before=True``: the temporary dir will always be cleared at the beginning of the test.
+    - ``before=False``: if the temporary dir already existed, any existing files will remain there.
+    - ``after=True``: the temporary dir will always be deleted at the end of the test.
+    - ``after=False``: the temporary dir will always be left intact at the end of the test.

 .. note::
   In order to run the equivalent of ``rm -r`` safely, only subdirs of the project repository checkout are allowed if
@@ -815,7 +828,7 @@ or the ``xfail`` way:
    @pytest.mark.xfail
    def test_feature_x():

-Here is how to skip a test based on some internal check inside the test:
+- Here is how to skip a test based on some internal check inside the test:

 .. code-block:: python

@@ -838,7 +851,7 @@ or the ``xfail`` way:
    def test_feature_x():
        pytest.xfail("expected to fail until bug XYZ is fixed")

-Here is how to skip all tests in a module if some import is missing:
+- Here is how to skip all tests in a module if some import is missing:

 .. code-block:: python

@@ -908,9 +921,10 @@ pipelines), then we should run that test in the non-slow test suite. If it's foc
 such as the documentation or the examples, then we should run these tests in the slow test suite. And then, to refine
 this approach we should have exceptions:

-* All tests that need to download a heavy set of weights (e.g., model or tokenizer integration tests, pipeline
-  integration tests) should be set to slow. If you're adding a new model, you should create and upload to the hub a
-  tiny version of it (with random weights) for integration tests. This is discussed in the following paragraphs.
+* All tests that need to download a heavy set of weights or a dataset that is larger than ~50MB (e.g., model or
+  tokenizer integration tests, pipeline integration tests) should be set to slow. If you're adding a new model, you
+  should create and upload to the hub a tiny version of it (with random weights) for integration tests. This is
+  discussed in the following paragraphs.
 * All tests that need to do a training not specifically optimized to be fast should be set to slow.
 * We can introduce exceptions if some of these should-be-non-slow tests are excruciatingly slow, and set them to
  ``@slow``. Auto-modeling tests, which save and load large files to disk, are a good example of tests that are marked
@@ -1054,7 +1068,7 @@ If you need to validate the output of a logger, you can use :obj:`CaptureLogger`

    msg = "Testing 1, 2, 3"
    logging.set_verbosity_info()
-    logger = logging.get_logger("transformers.tokenization_bart")
+    logger = logging.get_logger("transformers.models.bart.tokenization_bart")
    with CaptureLogger(logger) as cl:
        logger.info(msg)
    assert cl.out, msg+"\n"
@@ -1128,3 +1142,66 @@ To start a debugger at the point of the warning, do this:
 .. code-block:: bash

    pytest tests/test_logging.py -W error::UserWarning --pdb
+
+
+
+Testing Experimental CI Features
+-----------------------------------------------------------------------------------------------------------------------
+
+Testing CI features can be potentially problematic as it can interfere with the normal CI functioning. Therefore if a
+new CI feature is to be added, it should be done as follows.
+
+1. Create a new dedicated job that tests what needs to be tested
+2. The new job must always succeed so that it gives us a green ✓ (details below).
+3. Let it run for some days to see that a variety of different PR types get to run on it (user fork branches,
+   non-forked branches, branches originating from github.com UI direct file edit, various forced pushes, etc. - there
+   are so many) while monitoring the experimental job's logs (not the overall job green as it's purposefully always
+   green)
+4. When it's clear that everything is solid, then merge the new changes into existing jobs.
+
+That way experiments on CI functionality itself won't interfere with the normal workflow.
+
+Now how can we make the job always succeed while the new CI feature is being developed?
+
+Some CIs, like TravisCI, support ignore-step-failure and will report the overall job as successful, but CircleCI and
+Github Actions as of this writing don't support that.
+
+So the following workaround can be used:
+
+1. ``set +euo pipefail`` at the beginning of the run command to suppress most potential failures in the bash script.
+2. the last command must be a success: ``echo "done"`` or just ``true`` will do
+
+Here is an example:
+
+.. code-block:: yaml
+
+    - run:
+        name: run CI experiment
+        command: |
+            set +euo pipefail
+            echo "setting run-all-despite-any-errors-mode"
+            this_command_will_fail
+            echo "but bash continues to run"
+            # emulate another failure
+            false
+            # but the last command must be a success
+            echo "during experiment do not remove: reporting success to CI, even if there were failures"
+
+For simple commands you could also do:
+
+.. code-block:: bash
+
+    cmd_that_may_fail || true
+
+Of course, once satisfied with the results, integrate the experimental step or job with the rest of the normal jobs,
+while removing ``set +euo pipefail`` or any other things you may have added to ensure that the experimental job doesn't
+interfere with the normal CI functioning.
+
+This whole process would have been much easier if we only could set something like ``allow-failure`` for the
+experimental step, and let it fail without impacting the overall status of PRs. But as mentioned earlier CircleCI and
+Github Actions don't support it at the moment.
+
+You can vote for this feature and see where it stands in these CI-specific threads:
+
+* `Github Actions: <https://github.com/actions/toolkit/issues/399>`__
+* `CircleCI: <https://ideas.circleci.com/ideas/CCI-I-344>`__
--- a/docs/source/tokenizer_summary.rst
+++ b/docs/source/tokenizer_summary.rst
@@ -1,223 +1,255 @@
-Tokenizer summary
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Summary of the tokenizers
 -----------------------------------------------------------------------------------------------------------------------

-In this page, we will have a closer look at tokenization. As we saw in :doc:`the preprocessing tutorial
-<preprocessing>`, tokenizing a text is splitting it into words or subwords, which then are converted to ids. The second
-part is pretty straightforward, here we will focus on the first part. More specifically, we will look at the three main
-different kinds of tokenizers used in 🤗 Transformers: :ref:`Byte-Pair Encoding (BPE) <byte-pair-encoding>`,
-:ref:`WordPiece <wordpiece>` and :ref:`SentencePiece <sentencepiece>`, and provide examples of models using each of
-those.
+On this page, we will have a closer look at tokenization. As we saw in :doc:`the preprocessing tutorial
+<preprocessing>`, tokenizing a text is splitting it into words or subwords, which then are converted to ids through a
+look-up table. Converting words or subwords to ids is straightforward, so in this summary, we will focus on splitting a
+text into words or subwords (i.e. tokenizing a text). More specifically, we will look at the three main types of
+tokenizers used in 🤗 Transformers: :ref:`Byte-Pair Encoding (BPE) <byte-pair-encoding>`, :ref:`WordPiece <wordpiece>`,
+and :ref:`SentencePiece <sentencepiece>`, and show examples of which tokenizer type is used by which model.

-Note that on each model page, you can look at the documentation of the associated tokenizer to know which of those
-algorithms the pretrained model used. For instance, if we look at :class:`~transformers.BertTokenizer`, we can see it's
-using :ref:`WordPiece <wordpiece>`.
+Note that on each model page, you can look at the documentation of the associated tokenizer to know which tokenizer
+type was used by the pretrained model. For instance, if we look at :class:`~transformers.BertTokenizer`, we can see
+that the model uses :ref:`WordPiece <wordpiece>`.

-Introduction to tokenization
+Introduction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Splitting a text in smaller chunks is a task that's harder than it looks, and there are multiple ways of doing it. For
-instance, let's look at the sentence "Don't you love 🤗 Transformers? We sure do." A first simple way of tokenizing this
-text is just to split it by spaces, which would give:
+Splitting a text into smaller chunks is a task that is harder than it looks, and there are multiple ways of doing so.
+For instance, let's look at the sentence ``"Don't you love 🤗 Transformers? We sure do."`` A simple way of tokenizing
+this text is to split it by spaces, which would give:

 .. code-block::

    ["Don't", "you", "love", "🤗", "Transformers?", "We", "sure", "do."]

-This is a nice first step, but if we look at the tokens "Transformers?" or "do.", we can see we can do better. Those
-will be different than the tokens "Transformers" and "do" for our model, so we should probably take the punctuation
-into account. This would give:
+This is a sensible first step, but if we look at the tokens ``"Transformers?"`` and ``"do."``, we notice that the
+punctuation is attached to the words ``"Transformers"`` and ``"do"``, which is suboptimal. We should take the
+punctuation into account so that a model does not have to learn a different representation of a word and every possible
+punctuation symbol that could follow it, which would explode the number of representations the model has to learn.
+Taking punctuation into account, tokenizing our example text would give:

 .. code-block::

    ["Don", "'", "t", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."]

-which is better already. One thing that is annoying though is how it dealt with "Don't". "Don't" stands for do not, so
-it should probably be better tokenized as ``["Do", "n't"]``. This is where things start getting more complicated, and
-part of the reason each kind of model has its own tokenizer class. Depending on the rules we apply to split our texts
-into tokens, we'll get different tokenized versions of the same text. And of course, a given pretrained model won't
-perform properly if you don't use the exact same rules as the persons who pretrained it.
+Better. However, the way the tokenization dealt with the word ``"Don't"`` is not ideal. ``"Don't"`` stands for
+``"do not"``, so it would be better tokenized as ``["Do", "n't"]``. This is where things start getting complicated, and
+part of the reason each model has its own tokenizer type. Depending on the rules we apply for tokenizing a text, a
+different tokenized output is generated for the same text. A pretrained model only performs properly if you feed it an
+input that was tokenized with the same rules that were used to tokenize its training data.

 `spaCy <https://spacy.io/>`__ and `Moses <http://www.statmt.org/moses/?n=Development.GetStarted>`__ are two popular
-rule-based tokenizers. On the text above, they'd output something like:
+rule-based tokenizers. Applying them on our example, *spaCy* and *Moses* would output something like:

 .. code-block::

    ["Do", "n't", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."]

-Space/punctuation-tokenization and rule-based tokenization are both examples of word tokenization, which is splitting a
-sentence into words. While it's the most intuitive way to separate texts in smaller chunks, it can have a problem when
-you have a huge corpus: it usually yields a very big vocabulary (the set of all unique tokens used). :doc:`Transformer
-XL <model_doc/transformerxl>` for instance uses space/punctuation-tokenization, and has a vocabulary size of 267,735!
+As can be seen, space and punctuation tokenization, as well as rule-based tokenization, is used here. Space and
+punctuation tokenization and rule-based tokenization are both examples of word tokenization, which is loosely defined
+as splitting sentences into words. While it's the most intuitive way to split texts into smaller chunks, this
+tokenization method can lead to problems for massive text corpora. In this case, space and punctuation tokenization
+usually generates a very big vocabulary (the set of all unique words and tokens used). *E.g.*, :doc:`Transformer XL
+<model_doc/transformerxl>` uses space and punctuation tokenization, resulting in a vocabulary size of 267,735!

-A huge vocabulary size means a huge embedding matrix at the start of the model, which will cause memory problems.
-TransformerXL deals with it by using a special kind of embeddings called adaptive embeddings, but in general,
-transformers models rarely have a vocabulary size greater than 50,000, especially if they are trained on a single
-language.
+Such a big vocabulary size forces the model to have an enormous embedding matrix as the input and output layer, which
+causes both an increased memory and time complexity. In general, transformers models rarely have a vocabulary size
+greater than 50,000, especially if they are pretrained only on a single language.

-So if tokenizing on words is unsatisfactory, we could go on the opposite direction and simply tokenize on characters.
-While it's very simple and would save a lot of memory, this doesn't allow the model to learn representations of texts
-as meaningful as when using a word tokenization, leading to a loss of performance. So to get the best of both worlds,
-all transformers models use a hybrid between word-level and character-level tokenization called subword tokenization.
+So if simple space and punctuation tokenization is unsatisfactory, why not simply tokenize on characters? While
+character tokenization is very simple and would greatly reduce memory and time complexity, it makes it much harder for
+the model to learn meaningful input representations. *E.g.* learning a meaningful context-independent representation
+for the letter ``"t"`` is much harder than learning a context-independent representation for the word ``"today"``.
+Therefore, character tokenization is often accompanied by a loss of performance. So to get the best of both worlds,
+transformers models use a hybrid between word-level and character-level tokenization called **subword** tokenization.

 Subword tokenization
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Subword tokenization algorithms rely on the principle that most common words should be left as is, but rare words
-should be decomposed in meaningful subword units. For instance "annoyingly" might be considered a rare word and
-decomposed as "annoying" and "ly". This is especially useful in agglutinative languages such as Turkish, where you can
-form (almost) arbitrarily long complex words by stringing together some subwords.
+Subword tokenization algorithms rely on the principle that frequently used words should not be split into smaller
+subwords, but rare words should be decomposed into meaningful subwords. For instance ``"annoyingly"`` might be
+considered a rare word and could be decomposed into ``"annoying"`` and ``"ly"``. Both ``"annoying"`` and ``"ly"`` as
+stand-alone subwords would appear more frequently while at the same time the meaning of ``"annoyingly"`` is kept by the
+composite meaning of ``"annoying"`` and ``"ly"``. This is especially useful in agglutinative languages such as Turkish,
+where you can form (almost) arbitrarily long complex words by stringing together subwords.

-This allows the model to keep a reasonable vocabulary while still learning useful representations for common words or
-subwords. This also enables the model to process words it has never seen before, by decomposing them into subwords it
-knows. For instance, the base :class:`~transformers.BertTokenizer` will tokenize "I have a new GPU!" like this:
+Subword tokenization allows the model to have a reasonable vocabulary size while being able to learn meaningful
+context-independent representations. In addition, subword tokenization enables the model to process words it has never
+seen before, by decomposing them into known subwords. For instance, the :class:`~transformers.BertTokenizer` tokenizes
+``"I have a new GPU!"`` as follows:

 .. code-block::

    >>> from transformers import BertTokenizer
-    >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    >>> tokenizer.tokenize("I have a new GPU!")
-    ['i', 'have', 'a', 'new', 'gp', '##u', '!']
+    ["i", "have", "a", "new", "gp", "##u", "!"]

-Since we are considering the uncased model, the sentence was lowercased first. Then all the words were present in the
-vocabulary of the tokenizer, except for "gpu", so the tokenizer splits it in subwords it knows: "gp" and "##u". The
-"##" means that the rest of the token should be attached to the previous one, without space (for when we need to decode
-predictions and reverse the tokenization).
+Because we are considering the uncased model, the sentence was lowercased first. We can see that the words ``["i",
+"have", "a", "new"]`` are present in the tokenizer's vocabulary, but the word ``"gpu"`` is not. Consequently, the
+tokenizer splits ``"gpu"`` into known subwords: ``["gp", "##u"]``. ``"##"`` means that the rest of the token should
+be attached to the previous one, without space (for decoding or reversal of the tokenization).

-Another example is when we use the base :class:`~transformers.XLNetTokenizer` to tokenize our previous text:
+As another example, :class:`~transformers.XLNetTokenizer` tokenizes our previous example text as follows:

 .. code-block::

    >>> from transformers import XLNetTokenizer
-    >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
+    >>> tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
    >>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.")
-    ['▁Don', "'", 't', '▁you', '▁love', '▁', '🤗', '▁', 'Transform', 'ers', '?', '▁We', '▁sure', '▁do', '.']
+    ["▁Don", "'", "t", "▁you", "▁love", "▁", "🤗", "▁", "Transform", "ers", "?", "▁We", "▁sure", "▁do", "."]

-We'll get back to the meaning of those '▁' when we look at :ref:`SentencePiece <sentencepiece>` but you can see
-Transformers has been split into "Transform" and "ers".
+We'll get back to the meaning of those ``"▁"`` when we look at :ref:`SentencePiece <sentencepiece>`. As one can see,
+the rare word ``"Transformers"`` has been split into the more frequent subwords ``"Transform"`` and ``"ers"``.

-Let's now look at how the different subword tokenization algorithms work. Note that they all rely on some form of
-training which is usually done on the corpus the corresponding model will be trained on.
+Let's now look at how the different subword tokenization algorithms work. Note that all of those tokenization
+algorithms rely on some form of training which is usually done on the corpus the corresponding model will be trained
+on.

 .. _byte-pair-encoding:

-Byte-Pair Encoding
+Byte-Pair Encoding (BPE)
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Byte-Pair Encoding was introduced in `this paper <https://arxiv.org/abs/1508.07909>`__. It relies on a pretokenizer
-splitting the training data into words, which can be a simple space tokenization (:doc:`GPT-2 <model_doc/gpt2>` and
-:doc:`Roberta <model_doc/roberta>` uses this for instance) or a rule-based tokenizer (:doc:`XLM <model_doc/xlm>` use
-Moses for most languages, as does :doc:`FlauBERT <model_doc/flaubert>`),
+Byte-Pair Encoding (BPE) was introduced in `Neural Machine Translation of Rare Words with Subword Units (Sennrich et
+al., 2015) <https://arxiv.org/abs/1508.07909>`__. BPE relies on a pre-tokenizer that splits the training data into
+words. Pre-tokenization can be as simple as space tokenization, e.g. :doc:`GPT-2 <model_doc/gpt2>`, :doc:`Roberta
+<model_doc/roberta>`. More advanced pre-tokenization includes rule-based tokenization, e.g. :doc:`XLM <model_doc/xlm>`,
+:doc:`FlauBERT <model_doc/flaubert>` which uses Moses for most languages, or :doc:`GPT <model_doc/gpt>` which uses
+Spacy and ftfy, to count the frequency of each word in the training corpus.

-:doc:`GPT <model_doc/gpt>` uses Spacy and ftfy, and counts the frequency of each word in the training corpus.
+After pre-tokenization, a set of unique words has been created and the frequency with which each word occurred in the
+training data has been determined. Next, BPE creates a base vocabulary consisting of all symbols that occur in the set
+of unique words and learns merge rules to form a new symbol from two symbols of the base vocabulary. It does so until
+the vocabulary has attained the desired vocabulary size. Note that the desired vocabulary size is a hyperparameter to
+define before training the tokenizer.

-It then begins from the list of all characters and will learn merge rules to form a new token from two symbols in the
-vocabulary until it has learned a vocabulary of the desired size (this is a hyperparameter to pick).
-
-Let's say that after the pre-tokenization we have the following words (the number indicating the frequency of each
-word):
+As an example, let's assume that after pre-tokenization, the following set of words including their frequency has been
+determined:

 .. code-block::

-    ('hug', 10), ('pug', 5), ('pun', 12), ('bun', 4), ('hugs', 5)
+    ("hug", 10), ("pug", 5), ("pun", 12), ("bun", 4), ("hugs", 5)

-Then the base vocabulary is ['b', 'g', 'h', 'n', 'p', 's', 'u'] and all our words are first split by character:
+Consequently, the base vocabulary is ``["b", "g", "h", "n", "p", "s", "u"]``. Splitting all words into symbols of the
+base vocabulary, we obtain:

 .. code-block::

-    ('h' 'u' 'g', 10), ('p' 'u' 'g', 5), ('p' 'u' 'n', 12), ('b' 'u' 'n', 4), ('h' 'u' 'g' 's', 5)
+    ("h" "u" "g", 10), ("p" "u" "g", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "u" "g" "s", 5)

-We then take each pair of symbols and look at the most frequent. For instance 'hu' is present `10 + 5 = 15` times (10
-times in the 10 occurrences of 'hug', 5 times in the 5 occurrences of 'hugs'). The most frequent here is 'ug', present
-`10 + 5 + 5 = 20` times in total. So the first merge rule the tokenizer learns is to group all 'u' and 'g' together
-then it adds 'ug' to the vocabulary. Our corpus then becomes
+BPE then counts the frequency of each possible symbol pair and picks the symbol pair that occurs most frequently. In
+the example above ``"h"`` followed by ``"u"`` is present `10 + 5 = 15` times (10 times in the 10 occurrences of
+``"hug"``, 5 times in the 5 occurrences of ``"hugs"``). However, the most frequent pair is ``"u"`` followed by ``"g"``,
+occurring `10 + 5 + 5 = 20` times in total. Thus, the first merge rule the tokenizer learns is to group all ``"u"``
+symbols followed by a ``"g"`` symbol together. Next, ``"ug"`` is added to the vocabulary. The set of words then becomes

 .. code-block::

-    ('h' 'ug', 10), ('p' 'ug', 5), ('p' 'u' 'n', 12), ('b' 'u' 'n', 4), ('h' 'ug' 's', 5)
+    ("h" "ug", 10), ("p" "ug", 5), ("p" "u" "n", 12), ("b" "u" "n", 4), ("h" "ug" "s", 5)

-and we continue by looking at the next most common pair of symbols. It's 'un', present 16 times, so we merge those two
-and add 'un' to the vocabulary. Then it's 'hug' (as 'h' + 'ug'), present 15 times, so we merge those two and add 'hug'
-to the vocabulary.
+BPE then identifies the next most common symbol pair. It's ``"u"`` followed by ``"n"``, which occurs 16 times. ``"u"``,
+``"n"`` is merged to ``"un"`` and added to the vocabulary. The next most frequent symbol pair is ``"h"`` followed by
+``"ug"``, occurring 15 times. Again the pair is merged and ``"hug"`` can be added to the vocabulary.

-At this stage, the vocabulary is ``['b', 'g', 'h', 'n', 'p', 's', 'u', 'ug', 'un', 'hug']`` and our corpus is
-represented as
+At this stage, the vocabulary is ``["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"]`` and our set of unique words
+is represented as

 .. code-block::

-    ('hug', 10), ('p' 'ug', 5), ('p' 'un', 12), ('b' 'un', 4), ('hug' 's', 5)
+    ("hug", 10), ("p" "ug", 5), ("p" "un", 12), ("b" "un", 4), ("hug" "s", 5)

-If we stop there, the tokenizer can apply the rules it learned to new words (as long as they don't contain characters
-that were not in the base vocabulary). For instance 'bug' would be tokenized as ``['b', 'ug']`` but mug would be
-tokenized as ``['<unk>', 'ug']`` since the 'm' is not in the base vocabulary. This doesn't happen to letters in general
-(since the base corpus uses all of them), but to special characters like emojis.
+Assuming that the Byte-Pair Encoding training would stop at this point, the learned merge rules would then be applied
+to new words (as long as those new words do not include symbols that were not in the base vocabulary). For instance,
+the word ``"bug"`` would be tokenized to ``["b", "ug"]`` but ``"mug"`` would be tokenized as ``["<unk>", "ug"]`` since
+the symbol ``"m"`` is not in the base vocabulary. In general, single letters such as ``"m"`` are not replaced by the
+``"<unk>"`` symbol because the training data usually includes at least one occurrence of each letter, but it is likely
+to happen for very special characters like emojis.

-As we said before, the vocabulary size (which is the base vocabulary size + the number of merges) is a hyperparameter
+As mentioned earlier, the vocabulary size, *i.e.* the base vocabulary size + the number of merges, is a hyperparameter
 to choose. For instance :doc:`GPT <model_doc/gpt>` has a vocabulary size of 40,478 since they have 478 base characters
-and chose to stop the training of the tokenizer at 40,000 merges.
+and chose to stop training after 40,000 merges.

 Byte-level BPE
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-To deal with the fact the base vocabulary needs to get all base characters, which can be quite big if one allows for
-all unicode characters, the `GPT-2 paper
-<https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`__ introduces a
-clever trick, which is to use bytes as the base vocabulary (which gives a size of 256). With some additional rules to
-deal with punctuation, this manages to be able to tokenize every text without needing an unknown token. For instance,
-the :doc:`GPT-2 model <model_doc/gpt>` has a vocabulary size of 50,257, which corresponds to the 256 bytes base tokens,
-a special end-of-text token and the symbols learned with 50,000 merges.
+A base vocabulary that includes all possible base characters can be quite large if *e.g.* all unicode characters are
+considered as base characters. To have a better base vocabulary, `GPT-2
+<https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`__ uses bytes
+as the base vocabulary, which is a clever trick to force the base vocabulary to be of size 256 while ensuring that
+every base character is included in the vocabulary. With some additional rules to deal with punctuation, GPT-2's
+tokenizer can tokenize every text without the need for the ``"<unk>"`` symbol. :doc:`GPT-2 <model_doc/gpt2>` has a
+vocabulary size of 50,257, which corresponds to the 256 bytes base tokens, a special end-of-text token and the symbols
+learned with 50,000 merges.

 .. _wordpiece:

 WordPiece
 =======================================================================================================================

-WordPiece is the subword tokenization algorithm used for :doc:`BERT <model_doc/bert>` (as well as :doc:`DistilBERT
-<model_doc/distilbert>` and :doc:`Electra <model_doc/electra>`) and was outlined in `this paper
-<https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf>`__. It relies on the same
-base as BPE, which is to initialize the vocabulary to every character present in the corpus and progressively learn a
-given number of merge rules, the difference is that it doesn't choose the pair that is the most frequent but the one
-that will maximize the likelihood on the corpus once merged.
+WordPiece is the subword tokenization algorithm used for :doc:`BERT <model_doc/bert>`, :doc:`DistilBERT
+<model_doc/distilbert>`, and :doc:`Electra <model_doc/electra>`. The algorithm was outlined in `Japanese and Korean
+Voice Search (Schuster et al., 2012)
+<https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf>`__ and is very similar to
+BPE. WordPiece first initializes the vocabulary to include every character present in the training data and
+progressively learns a given number of merge rules. In contrast to BPE, WordPiece does not choose the most frequent
+symbol pair, but the one that maximizes the likelihood of the training data once added to the vocabulary.

-What does this mean? Well, in the previous example, it means we would only merge 'u' and 'g' if the probability of
-having 'ug' divided by the probability of having 'u' then 'g' is greater than for any other pair of symbols. It's
-subtly different from what BPE does in the sense that it evaluates what it "loses" by merging two symbols and makes
-sure it's `worth it`.
+So what does this mean exactly? Referring to the previous example, maximizing the likelihood of the training data is
+equivalent to finding the symbol pair, whose probability divided by the probabilities of its first symbol followed by
+its second symbol is the greatest among all symbol pairs. *E.g.* ``"u"``, followed by ``"g"`` would have only been
+merged if the probability of ``"ug"`` divided by ``"u"``, ``"g"`` would have been greater than for any other symbol
+pair. Intuitively, WordPiece is slightly different to BPE in that it evaluates what it `loses` by merging two symbols
+to ensure it's `worth it`.

 .. _unigram:

 Unigram
 =======================================================================================================================

-Unigram is a subword tokenization algorithm introduced in `this paper <https://arxiv.org/pdf/1804.10959.pdf>`__.
-Instead of starting with a group of base symbols and learning merges with some rule, like BPE or WordPiece, it starts
-from a large vocabulary (for instance, all pretokenized words and the most common substrings) that it will trim down
-progressively. It's not used directly for any of the pretrained models in the library, but it's used in conjunction
-with :ref:`SentencePiece <sentencepiece>`.
+Unigram is a subword tokenization algorithm introduced in `Subword Regularization: Improving Neural Network Translation
+Models with Multiple Subword Candidates (Kudo, 2018) <https://arxiv.org/pdf/1804.10959.pdf>`__. In contrast to BPE or
+WordPiece, Unigram initializes its base vocabulary to a large number of symbols and progressively removes symbols to
+obtain a smaller vocabulary. The base vocabulary could for instance correspond to all pre-tokenized words and the most
+common substrings. Unigram is not used directly for any of the models in 🤗 Transformers, but it's used in conjunction
+with :ref:`SentencePiece <sentencepiece>`.

-More specifically, at a given step, unigram computes a loss from the corpus we have and the current vocabulary, then,
-for each subword, evaluate how much the loss would increase if the subword was removed from the vocabulary. It then
-sorts the subwords by this quantity (that represents how much worse the loss becomes if the token is removed) and
-removes all the worst p tokens (for instance p could be 10% or 20%). It then repeats the process until the vocabulary
-has reached the desired size, always keeping the base characters (to be able to tokenize any word written with them,
-like BPE or WordPiece).
+At each training step, the Unigram algorithm defines a loss (often defined as the log-likelihood) over the training
+data given the current vocabulary and a unigram language model. Then, for each symbol in the vocabulary, the algorithm
+computes how much the overall loss would increase if the symbol was to be removed from the vocabulary. Unigram then
+removes p (with p usually being 10% or 20%) percent of the symbols whose loss increase is the lowest, *i.e.* those
+symbols that least affect the overall loss over the training data. This process is repeated until the vocabulary has
+reached the desired size. The Unigram algorithm always keeps the base characters so that any word can be tokenized.

-Contrary to BPE and WordPiece that work out rules in a certain order that you can then apply in the same order when
-tokenizing new text, Unigram will have several ways of tokenizing a new text. For instance, if it ends up with the
-vocabulary
+Because Unigram is not based on merge rules (in contrast to BPE and WordPiece), the algorithm has several ways of
+tokenizing new text after training. As an example, if a trained Unigram tokenizer exhibits the vocabulary:

 .. code-block::

-    ['b', 'g', 'h', 'n', 'p', 's', 'u', 'ug', 'un', 'hug']
+    ["b", "g", "h", "n", "p", "s", "u", "ug", "un", "hug"],

-we had before, it could tokenize "hugs" as ``['hug', 's']``, ``['h', 'ug', 's']`` or ``['h', 'u', 'g', 's']``. So which
-one choose? On top of saving the vocabulary, the trained tokenizer will save the probability of each token in the
-training corpus. You can then give a probability to each tokenization (which is the product of the probabilities of the
-tokens forming it) and pick the most likely one (or if you want to apply some data augmentation, you could sample one
-of the tokenization according to their probabilities).
+``"hugs"`` could be tokenized as ``["hug", "s"]``, ``["h", "ug", "s"]`` or ``["h", "u", "g", "s"]``. So which one
+to choose? Unigram saves the probability of each token in the training corpus on top of saving the vocabulary so that
+the probability of each possible tokenization can be computed after training. The algorithm simply picks the most
+likely tokenization in practice, but also offers the possibility to sample a possible tokenization according to their
+probabilities.

-Those probabilities define the loss that trains the tokenizer: if our corpus consists of the words :math:`x_{1}, \dots,
-x_{N}` and if for the word :math:`x_{i}` we note :math:`S(x_{i})` the set of all possible tokenizations of
-:math:`x_{i}` (with the current vocabulary), then the loss is defined as
+Those probabilities are defined by the loss the tokenizer is trained on. Assuming that the training data consists of
+the words :math:`x_{1}, \dots, x_{N}` and that the set of all possible tokenizations for a word :math:`x_{i}` is
+defined as :math:`S(x_{i})`, then the overall loss is defined as

 .. math::
    \mathcal{L} = -\sum_{i=1}^{N} \log \left ( \sum_{x \in S(x_{i})} p(x) \right )
@@ -227,15 +259,18 @@ x_{N}` and if for the word :math:`x_{i}` we note :math:`S(x_{i})` the set of all
 SentencePiece
 =======================================================================================================================

-All the methods we have been looking at so far required some form of pretokenization, which has a central problem: not
-all languages use spaces to separate words. This is a problem :doc:`XLM <model_doc/xlm>` solves by using specific
-pretokenizers for each of those languages (in this case, Chinese, Japanese and Thai). To solve this problem,
-SentencePiece (introduced in `this paper <https://arxiv.org/pdf/1808.06226.pdf>`__) treats the input as a raw stream,
-includes the space in the set of characters to use, then uses BPE or unigram to construct the appropriate vocabulary.
+All tokenization algorithms described so far have the same problem: It is assumed that the input text uses spaces to
+separate words. However, not all languages use spaces to separate words. One possible solution is to use language
+specific pre-tokenizers, *e.g.* :doc:`XLM <model_doc/xlm>` uses a specific Chinese, Japanese, and Thai pre-tokenizer.
+To solve this problem more generally, `SentencePiece: A simple and language independent subword tokenizer and
+detokenizer for Neural Text Processing (Kudo et al., 2018) <https://arxiv.org/pdf/1808.06226.pdf>`__ treats the input
+as a raw input stream, thus including the space in the set of characters to use. It then uses the BPE or unigram
+algorithm to construct the appropriate vocabulary.

-That's why in the example we saw before using :class:`~transformers.XLNetTokenizer` (which uses SentencePiece), we had
-the '▁' character, that represents space. Decoding a tokenized text is then super easy: we just have to concatenate all
-of them together and replace '▁' with space.
+The :class:`~transformers.XLNetTokenizer` uses SentencePiece for example, which is also why in the example earlier the
+``"▁"`` character was included in the vocabulary. Decoding with SentencePiece is very easy since all tokens can just be
+concatenated and ``"▁"`` is replaced by a space.

-All transformers models in the library that use SentencePiece use it with unigram. Examples of models using it are
-:doc:`ALBERT <model_doc/albert>`, :doc:`XLNet <model_doc/xlnet>` or the :doc:`Marian framework <model_doc/marian>`.
+All transformers models in the library that use SentencePiece use it in combination with unigram. Examples of models
+using SentencePiece are :doc:`ALBERT <model_doc/albert>`, :doc:`XLNet <model_doc/xlnet>`, :doc:`Marian
+<model_doc/marian>`, and :doc:`T5 <model_doc/t5>`.
--- a/docs/source/training.rst
+++ b/docs/source/training.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Training and fine-tuning
 =======================================================================================================================

@@ -39,7 +51,7 @@ head on top of the encoder with an output size of 2. Models are initialized in `
 .. code-block:: python

    from transformers import BertForSequenceClassification
-    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True)
+    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
    model.train()

 This is useful because it allows us to make use of the pre-trained BERT encoder and easily train it on whatever
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,58 +1,73 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # Examples

-Version 2.9 of 🤗 Transformers introduced a new [`Trainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) class for PyTorch, and its equivalent [`TFTrainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_tf.py) for TF 2.
-Running the examples requires PyTorch 1.3.1+ or TensorFlow 2.2+.
-
-Here is the list of all our examples:
- **grouped by task** (all official examples work for multiple models)
- with information on whether they are **built on top of `Trainer`/`TFTrainer`** (if not, they still work, they might
-  just lack some features),
- whether or not they leverage the [🤗 Datasets](https://github.com/huggingface/datasets) library.
- links to **Colab notebooks** to walk through the scripts and run them easily,
- links to **Cloud deployments** to be able to deploy large-scale trainings in the Cloud with little to no setup.
-
+This folder contains actively maintained examples of use of 🤗 Transformers organized along NLP tasks. If you are looking for an example that used to
+be in this folder, it may have moved to our [research projects](https://github.com/huggingface/transformers/tree/master/examples/research_projects) subfolder (which contains frozen snapshots of research projects).

 ## Important note

 **Important**

-To make sure you can successfully run the latest versions of the example scripts, you have to **install the library from source** and install some example-specific requirements.
-Execute the following steps in a new virtual environment:
-
+To make sure you can successfully run the latest versions of the example scripts, you have to **install the library from source** and install some example-specific requirements. To do this, execute the following steps in a new virtual environment:
 ```bash
 git clone https://github.com/huggingface/transformers
 cd transformers
 pip install .
-pip install -r ./examples/requirements.txt
+```
+Then `cd` into the example folder of your choice and run:
+```bash
+pip install -r requirements.txt
 ```

-Alternatively, you can run the version of the examples as they were for your current version of Transformers via (for instance with v3.4.0):
+Alternatively, you can run the version of the examples as they were for your current version of Transformers via (for instance with v3.5.1):
 ```bash
-git checkout tags/v3.4.0
+git checkout tags/v3.5.1
 ```

 ## The Big Table of Tasks

+Here is the list of all our examples:
+- with information on whether they are **built on top of `Trainer`/`TFTrainer`** (if not, they still work, they might
+  just lack some features),
+- whether or not they leverage the [🤗 Datasets](https://github.com/huggingface/datasets) library,
+- links to **Colab notebooks** to walk through the scripts and run them easily.
+<!--
+Coming soon!
+- links to **Cloud deployments** to be able to deploy large-scale trainings in the Cloud with little to no setup.
+-->
+
 | Task | Example datasets | Trainer support | TFTrainer support | 🤗 Datasets | Colab
 |---|---|:---:|:---:|:---:|:---:|
 | [**`language-modeling`**](https://github.com/huggingface/transformers/tree/master/examples/language-modeling)       | Raw text        | ✅ | -  | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb)
-| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification)   | GLUE, XNLI      | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://github.com/huggingface/notebooks/blob/master/examples/text_classification.ipynb)
-| [**`token-classification`**](https://github.com/huggingface/transformers/tree/master/examples/token-classification) | CoNLL NER       | ✅ | ✅ | ✅ | -
 | [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice)           | SWAG, RACE, ARC | ✅ | ✅ | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
-| [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering)     | SQuAD           | ✅ | ✅ | - | -
-| [**`text-generation`**](https://github.com/huggingface/transformers/tree/master/examples/text-generation)           | -               | n/a | n/a | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)
-| [**`distillation`**](https://github.com/huggingface/transformers/tree/master/examples/distillation)                 | All             | - | -  | - | -
+| [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering)     | SQuAD           | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/question_answering.ipynb)
 | [**`summarization`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)                     | CNN/Daily Mail  | ✅  | - | - | -
+| [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification)   | GLUE, XNLI      | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb)
+| [**`text-generation`**](https://github.com/huggingface/transformers/tree/master/examples/text-generation)           | -               | n/a | n/a | - | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)
+| [**`token-classification`**](https://github.com/huggingface/transformers/tree/master/examples/token-classification) | CoNLL NER       | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb)
 | [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)                       | WMT             | ✅  | - | - | -
-| [**`bertology`**](https://github.com/huggingface/transformers/tree/master/examples/bertology)                       | -               | - | - | - | -
-| [**`adversarial`**](https://github.com/huggingface/transformers/tree/master/examples/adversarial)                   | HANS            | ✅ | - | - | -


-<br>
-
+<!--
 ## One-click Deploy to Cloud (wip)

 **Coming soon!**
+-->

 ## Running on TPUs

--- a/examples/_tests_requirements.txt
+++ b/examples/_tests_requirements.txt
@@ -0,0 +1,20 @@
+tensorboard
+scikit-learn
+seqeval
+psutil
+sacrebleu
+rouge-score
+tensorflow_datasets
+matplotlib
+git-python==1.0.3
+faiss-cpu
+streamlit
+elasticsearch
+nltk
+pandas
+datasets >= 1.1.3
+fire
+pytest
+conllu
+sentencepiece != 0.1.92
+protobuf
--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`$PYTHON setup.py install # Python command to install the script.`