Release: v3.2.0

Fixes for LayoutLM (#7318 )
Create an XLA parameter and fix the mixed precision (#7311 )
2020-09-22 17:36:51 +02:00 · 2020-09-22 10:37:11 -04:00 · 2020-09-22 10:19:34 -04:00 · 2020-09-22 10:17:48 -04:00 · 2020-09-22 09:52:29 -04:00 · 2020-09-22 09:34:35 -04:00
1046 changed files with 123663 additions and 31284 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,4 +1,66 @@
-version: 2
+version: 2.1
+orbs:
+    gcp-gke: circleci/gcp-gke@1.0.4
+    go: circleci/go@1.3.0
+
+# TPU REFERENCES
+references:
+    checkout_ml_testing: &checkout_ml_testing
+        run:
+            name: Checkout ml-testing-accelerators
+            command: |
+                git clone https://github.com/GoogleCloudPlatform/ml-testing-accelerators.git
+                cd ml-testing-accelerators
+                git fetch origin 5e88ac24f631c27045e62f0e8d5dfcf34e425e25:stable
+                git checkout stable
+    build_push_docker: &build_push_docker
+        run:
+            name: Configure Docker
+            command: |
+                gcloud --quiet auth configure-docker
+                cd docker/transformers-pytorch-tpu
+                if [ -z "$CIRCLE_PR_NUMBER" ]; then docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" . ; else docker build --tag "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" -f Dockerfile --build-arg "TEST_IMAGE=1" --build-arg "GITHUB_REF=pull/$CIRCLE_PR_NUMBER/head" . ; fi
+                docker push "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID"
+    deploy_cluster: &deploy_cluster
+        run:
+            name: Deploy the job on the kubernetes cluster
+            command: |
+                go get github.com/google/go-jsonnet/cmd/jsonnet && \
+                export PATH=$PATH:$HOME/go/bin && \
+                kubectl create -f docker/transformers-pytorch-tpu/dataset.yaml || true && \
+                job_name=$(jsonnet -J ml-testing-accelerators/ docker/transformers-pytorch-tpu/bert-base-cased.jsonnet --ext-str image=$GCR_IMAGE_PATH --ext-str image-tag=$CIRCLE_WORKFLOW_JOB_ID | kubectl create -f -) && \
+                job_name=${job_name#job.batch/} && \
+                job_name=${job_name% created} && \
+                echo "Waiting on kubernetes job: $job_name" && \
+                i=0 && \
+                # 30 checks spaced 30s apart = 900s total.
+                max_checks=30 && \
+                status_code=2 && \
+                # Check on the job periodically. Set the status code depending on what
+                # happened to the job in Kubernetes. If we try max_checks times and
+                # still the job hasn't finished, give up and return the starting
+                # non-zero status code.
+                while [ $i -lt $max_checks ]; do ((i++)); if kubectl get jobs $job_name -o jsonpath='Failed:{.status.failed}' | grep "Failed:1"; then status_code=1 && break; elif kubectl get jobs $job_name -o jsonpath='Succeeded:{.status.succeeded}' | grep "Succeeded:1" ; then status_code=0 && break; else echo "Job not finished yet"; fi; sleep 30; done && \
+                echo "Done waiting. Job status code: $status_code" && \
+                pod_name=$(kubectl get po -l controller-uid=`kubectl get job $job_name -o "jsonpath={.metadata.labels.controller-uid}"` | awk 'match($0,!/NAME/) {print $1}') && \
+                echo "GKE pod name: $pod_name" && \
+                kubectl logs -f $pod_name --container=train
+                echo "Done with log retrieval attempt." && \
+                gcloud container images delete "$GCR_IMAGE_PATH:$CIRCLE_WORKFLOW_JOB_ID" --force-delete-tags && \
+                exit $status_code
+    delete_gke_jobs: &delete_gke_jobs
+        run:
+            name: Delete GKE Jobs
+            command: |
+                # Match jobs whose age matches patterns like '1h' or '1d', i.e. any job
+                # that has been around longer than 1hr. First print all columns for
+                # matches, then execute the delete.
+                kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $0}'
+                kubectl delete job $(kubectl get job | awk 'match($4,/[0-9]+[dh]/) {print $1}')
+
+
+
+
 jobs:
    run_tests_torch_and_tf:
        working_directory: ~/transformers
@@ -10,11 +72,23 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - run: sudo pip install .[sklearn,tf-cpu,torch,testing]
-            - run: sudo pip install codecov pytest-cov
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
+            - restore_cache:
+                  keys:
+                      - v0.3-torch_and_tf-{{ checksum "setup.py" }}
+                      - v0.3-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install git+https://github.com/huggingface/datasets
+            - run: pip install .[sklearn,tf-cpu,torch,testing]
+            - run: pip install codecov pytest-cov
+            - save_cache:
+                key: v0.3-{{ checksum "setup.py" }}
+                paths:
+                    - '~/.cache/pip'
+            - run: python -m pytest -n 8 --dist=loadfile -rA -s ./tests/ --cov  | tee output.txt
            - run: codecov
-
+            - store_artifacts:
+                  path: ~/transformers/output.txt
+                  destination: test_output.txt
    run_tests_torch:
        working_directory: ~/transformers
        docker:
@@ -25,10 +99,21 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - run: sudo pip install .[sklearn,torch,testing]
-            - run: sudo pip install codecov pytest-cov
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
-            - run: codecov
+            - restore_cache:
+                  keys:
+                      - v0.3-torch-{{ checksum "setup.py" }}
+                      - v0.3-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install git+https://github.com/huggingface/datasets
+            - run: pip install .[sklearn,torch,testing]
+            - save_cache:
+                  key: v0.3-torch-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: python -m pytest -n 8 --dist=loadfile -rA -s ./tests/ | tee output.txt
+            - store_artifacts:
+                  path: ~/transformers/output.txt
+                  destination: test_output.txt
    run_tests_tf:
        working_directory: ~/transformers
        docker:
@@ -39,10 +124,21 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - run: sudo pip install .[sklearn,tf-cpu,testing]
-            - run: sudo pip install codecov pytest-cov
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./tests/ --cov
-            - run: codecov
+            - restore_cache:
+                  keys:
+                      - v0.3-tf-{{ checksum "setup.py" }}
+                      - v0.3-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install git+https://github.com/huggingface/datasets
+            - run: pip install .[sklearn,tf-cpu,testing]
+            - save_cache:
+                  key: v0.3-tf-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: python -m pytest -n 8 --dist=loadfile -rA -s ./tests/ | tee output.txt
+            - store_artifacts:
+               path: ~/transformers/output.txt
+               destination: test_output.txt
    run_tests_custom_tokenizers:
        working_directory: ~/transformers
        docker:
@@ -51,8 +147,21 @@ jobs:
            RUN_CUSTOM_TOKENIZERS: yes
        steps:
            - checkout
-            - run: sudo pip install .[mecab,testing]
-            - run: python -m pytest -sv ./tests/test_tokenization_bert_japanese.py
+            - restore_cache:
+                  keys:
+                      - v0.3-custom_tokenizers-{{ checksum "setup.py" }}
+                      - v0.3-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[ja,testing]
+            - run: python -m unidic download
+            - save_cache:
+                  key: v0.3-custom_tokenizers-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: python -m pytest -s ./tests/test_tokenization_bert_japanese.py | tee output.txt
+            - store_artifacts:
+                path: ~/transformers/output.txt
+                destination: test_output.txt
    run_examples_torch:
        working_directory: ~/transformers
        docker:
@@ -63,17 +172,38 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - run: sudo pip install .[sklearn,torch,testing]
-            - run: sudo pip install -r examples/requirements.txt
-            - run: python -m pytest -n 8 --dist=loadfile -s -v ./examples/
+            - restore_cache:
+                  keys:
+                      - v0.3-torch_examples-{{ checksum "setup.py" }}
+                      - v0.3-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,torch,testing]
+            - run: pip install -r examples/requirements.txt
+            - save_cache:
+                  key: v0.3-torch_examples-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: python -m pytest -n 8 --dist=loadfile -rA -s ./examples/ | tee output.txt
+            - store_artifacts:
+                  path: ~/transformers/output.txt
+                  destination: test_output.txt
    build_doc:
        working_directory: ~/transformers
        docker:
            - image: circleci/python:3.6
        steps:
            - checkout
-            - run: sudo pip install .[tf,torch,docs]
-            - run: cd docs && make html
+            - restore_cache:
+                  keys:
+                      - v0.3-build_doc-{{ checksum "setup.py" }}
+                      - v0.3-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[tf,torch,docs]
+            - save_cache:
+                  key: v0.3-build_doc-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: cd docs && make html SPHINXOPTS="-W"
            - store_artifacts:
                path: ./docs/_build
    deploy_doc:
@@ -85,7 +215,15 @@ jobs:
                fingerprints:
                    - "5b:7a:95:18:07:8c:aa:76:4c:60:35:88:ad:60:56:71"
            - checkout
-            - run: sudo pip install .[tf,torch,docs]
+            - restore_cache:
+                  keys:
+                      - v0.3-deploy_doc-{{ checksum "setup.py" }}
+                      - v0.3-{{ checksum "setup.py" }}
+            - run: pip install .[tf,torch,docs]
+            - save_cache:
+                  key: v0.3-deploy_doc-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
            - run: ./.circleci/deploy.sh
    check_code_quality:
        working_directory: ~/transformers
@@ -95,12 +233,22 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            # we need a version of isort with https://github.com/timothycrosley/isort/pull/1000
-            - run: sudo pip install git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
-            - run: sudo pip install .[tf,torch,quality]
+            - restore_cache:
+                  keys:
+                      - v0.3-code_quality-{{ checksum "setup.py" }}
+                      - v0.3-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install isort
+            - run: pip install .[tf,torch,quality]
+            - save_cache:
+                  key: v0.3-code_quality-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
            - run: black --check --line-length 119 --target-version py35 examples templates tests src utils
-            - run: isort --check-only --recursive examples templates tests src utils
+            - run: isort --check-only examples templates tests src utils
            - run: flake8 examples templates tests src utils
+            - run: python utils/check_copies.py
+            - run: python utils/check_repo.py
    check_repository_consistency:
        working_directory: ~/transformers
        docker:
@@ -109,8 +257,37 @@ jobs:
        parallelism: 1
        steps:
            - checkout
-            - run: sudo pip install requests
+            - run: pip install requests
            - run: python ./utils/link_tester.py
+
+# TPU JOBS
+    run_examples_tpu:
+        docker:
+            - image: circleci/python:3.6
+        environment:
+            OMP_NUM_THREADS: 1
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - go/install
+            - *checkout_ml_testing
+            - gcp-gke/install
+            - gcp-gke/update-kubeconfig-with-credentials:
+                  cluster: $GKE_CLUSTER
+                  perform-login: true
+            - setup_remote_docker
+            - *build_push_docker
+            - *deploy_cluster
+    cleanup-gke-jobs:
+        docker:
+            - image: circleci/python:3.6
+        steps:
+            - gcp-gke/install
+            - gcp-gke/update-kubeconfig-with-credentials:
+                  cluster: $GKE_CLUSTER
+                  perform-login: true
+            - *delete_gke_jobs
 workflow_filters: &workflow_filters
    filters:
        branches:
@@ -129,3 +306,15 @@ workflows:
            - run_tests_tf
            - build_doc
            - deploy_doc: *workflow_filters
+    tpu_testing_jobs:
+        triggers:
+            - schedule:
+                # Set to run once a day at 08:00 (CircleCI schedules use UTC).
+                cron: "0 8 * * *"
+                filters:
+                    branches:
+                        only:
+                            - master
+        jobs:
+            - cleanup-gke-jobs
+            - run_examples_tpu
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -5,19 +5,31 @@ function deploy_doc(){
 	git checkout $1
 	if [ ! -z "$2" ]
 	then
-		if [ -d "$dir/$2" ]; then
+		if [ "$2" == "master" ]; then
+		    echo "Pushing master"
+			make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir/$2/
+			cp -r _build/html/_static .
+		elif ssh -oStrictHostKeyChecking=no $doc "[ -d $dir/$2 ]"; then
 			echo "Directory" $2 "already exists"
+			scp -r -oStrictHostKeyChecking=no _static/* $doc:$dir/$2/_static/
 		else
 			echo "Pushing version" $2
-			make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
+			make clean && make html
+			rm -rf _build/html/_static
+			cp -r _static _build/html
+			scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
 		fi
 	else
-		echo "Pushing master"
-		make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
+		echo "Pushing stable"
+		make clean && make html
+		rm -rf _build/html/_static
+		cp -r _static _build/html
+		scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
 	fi
 }

-deploy_doc "master"
+# You can find the commit for each tag on https://github.com/huggingface/transformers/tags
+deploy_doc "master" master
 deploy_doc "b33a385" v1.0.0
 deploy_doc "fe02e45" v1.1.0
 deploy_doc "89fd345" v1.2.0
@@ -27,3 +39,13 @@ deploy_doc "3616209" v2.2.0
 deploy_doc "d0f8b9a" v2.3.0
 deploy_doc "6664ea9" v2.4.0
 deploy_doc "fb560dc" v2.5.0
+deploy_doc "b90745c" v2.5.1
+deploy_doc "fbc5bf1" v2.6.0
+deploy_doc "6f5a12a" v2.7.0
+deploy_doc "11c3257" v2.8.0
+deploy_doc "e7cfc1a" v2.9.0
+deploy_doc "7cb203f" v2.9.1
+deploy_doc "10d7239" v2.10.0 
+deploy_doc "b42586e" v2.11.0
+deploy_doc "7fb8bdf" v3.0.2
+deploy_doc "4b3ee9c" # v3.1.0 Latest stable release
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -7,14 +7,53 @@ assignees: ''

 ---

-# 🐛 Bug
+
+## Environment info
+<!-- You can run the command `transformers-cli env` and copy-and-paste its output below.
+     Don't forget to fill out the missing fields in that output! -->
+     
+- `transformers` version:
+- Platform:
+- Python version:
+- PyTorch version (GPU?):
+- Tensorflow version (GPU?):
+- Using GPU in script?:
+- Using distributed or parallel set-up in script?:
+
+### Who can help
+<!-- Your issue will be replied to more quickly if you can figure out the right person to tag with @
+ If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
+ Please tag fewer than 3 people.
+ 
+ albert, bert, GPT2, XLM: @LysandreJik 
+ tokenizers: @mfuntowicz
+ Trainer: @sgugger
+ Speed and Memory Benchmarks: @patrickvonplaten
+ Model Cards: @julien-c
+ Translation: @sshleifer
+ Summarization: @sshleifer
+ TextGeneration: @TevenLeScao 
+ examples/distillation: @VictorSanh
+ nlp datasets: [different repo](https://github.com/huggingface/nlp)
+ rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
+ Text Generation: @TevenLeScao
+ blenderbot: @mariamabarham
+ Bart: @sshleifer
+ Marian: @sshleifer
+ T5: @patrickvonplaten
+ Longformer/Reformer: @patrickvonplaten
+ TransfoXL/XLNet: @TevenLeScao 
+ examples/seq2seq: @sshleifer
+ examples/bert-loses-patience: @JetRunner
+ tensorflow: @jplu
+ examples/token-classification: @stefan-it
+ documentation: @sgugger
+ -->

 ## Information

 Model I am using (Bert, XLNet ...):

-Language I am using the model on (English, Chinese ...):
-
 The problem arises when using:
 * [ ] the official example scripts: (give details below)
 * [ ] my own modified scripts: (give details below)
@@ -38,15 +77,3 @@ Steps to reproduce the behavior:
 ## Expected behavior

 <!-- A clear and concise description of what you would expect to happen. -->
-
-## Environment info
-<!-- You can run the command `transformers-cli env` and copy-and-paste its output below.
-     Don't forget to fill out the missing fields in that output! -->
-     
- `transformers` version:
- Platform:
- Python version:
- PyTorch version (GPU?):
- Tensorflow version (GPU?):
- Using GPU in script?:
- Using distributed or parallel set-up in script?:
--- a/.github/ISSUE_TEMPLATE/question-help.md
+++ b/.github/ISSUE_TEMPLATE/question-help.md
@@ -1,6 +1,6 @@
 ---
 name: "❓ Questions & Help"
-about: Post your general questions on Stack Overflow tagged huggingface-transformers
+about: Post your general questions on the Hugging Face forum or Stack Overflow tagged huggingface-transformers
 title: ''
 labels: ''
 assignees: ''
@@ -11,19 +11,17 @@ assignees: ''

 <!-- The GitHub issue tracker is primarly intended for bugs, feature requests,
     new models and benchmarks, and migration questions. For all other questions,
-     we direct you to Stack Overflow (SO) where a whole community of PyTorch and
-     Tensorflow enthusiast can help you out. Make sure to tag your question with the
-     right deep learning framework as well as the huggingface-transformers tag: 
+     we direct you to the Hugging Face forum: https://discuss.huggingface.co/ .
+     You can also try Stack Overflow (SO) where a whole community of PyTorch and
+     Tensorflow enthusiasts can help you out. In this case, make sure to tag your
+     question with the right deep learning framework as well as the
+     huggingface-transformers tag: 
     https://stackoverflow.com/questions/tagged/huggingface-transformers 
-     
-     If your question wasn't answered after a period of time on Stack Overflow, you
-     can always open a question on GitHub. You should then link to the SO question 
-     that you posted.
     -->

 ## Details
 <!-- Description of your issue -->

-<!-- You should first ask your question on SO, and only if
+<!-- You should first ask your question on the forum or SO, and only if
     you didn't get an answer ask it here on GitHub. -->
-**A link to original question on Stack Overflow**:
+**A link to original question on the forum/Stack Overflow**:
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -0,0 +1,2 @@
+<!-- This line specifies which issue to close after the pull request is merged. -->
+Fixes #{issue number}
--- a/.github/workflows/github-push.yml
+++ b/.github/workflows/github-push.yml
@@ -1,19 +0,0 @@
-name: GitHub-hosted runner
-
-on: push
-
-jobs:
-  check_code_quality:
-    runs-on: ubuntu-18.04
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python
-      uses: actions/setup-python@v1
-      with:
-        python-version: 3.7
-    # - name: Install dependencies
-    #   run: |
-    #     pip install .[tf,torch,quality]
-
-
-
--- a/.github/workflows/github-torch-hub.yml
+++ b/.github/workflows/github-torch-hub.yml
@@ -18,8 +18,17 @@ jobs:
      uses: actions/setup-python@v1
      with:
        python-version: 3.7
+
+    - name: Loading cache
+      uses: actions/cache@v2
+      id: cache
+      with:
+        path: ~/.cache/pip
+        key: v0-torch_hub-${{ hashFiles('setup.py') }}
+
    - name: Install dependencies
      run: |
+        pip install --upgrade pip
        pip install torch
        pip install numpy tokenizers filelock requests tqdm regex sentencepiece sacremoses packaging

--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -25,6 +25,14 @@ jobs:
    - name: Current dir
      run: pwd
    - run: nvidia-smi
+
+    - name: Loading cache.
+      uses: actions/cache@v2
+      id: cache
+      with:
+        path: .env
+        key: v0-tests_tf_torch_gpu-${{ hashFiles('setup.py') }}
+
    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
      run: |
        python -m venv .env
@@ -35,8 +43,10 @@ jobs:
    - name: Install dependencies
      run: |
        source .env/bin/activate
-        pip install torch
-        pip install .[sklearn,testing]
+        pip install --upgrade pip
+        pip install torch!=1.6.0
+        pip install .[sklearn,testing,onnxruntime]
+        pip install git+https://github.com/huggingface/datasets

    - name: Are GPUs recognized by our DL frameworks
      run: |
@@ -51,4 +61,4 @@ jobs:
        USE_CUDA: yes
      run: |
        source .env/bin/activate
-        python -m pytest -n 2 --dist=loadfile -s -v ./tests/
+        python -m pytest -n 2 --dist=loadfile -s ./tests/
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -13,6 +13,14 @@ jobs:
    runs-on: self-hosted
    steps:
    - uses: actions/checkout@v2
+
+    - name: Loading cache.
+      uses: actions/cache@v2
+      id: cache
+      with:
+        path: .env
+        key: v0-slow_tests_tf_torch_gpu-${{ hashFiles('setup.py') }}
+
    - name: Python version
      run: |
        which python
@@ -22,6 +30,7 @@ jobs:
      run: pwd
    - run: nvidia-smi
    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+      if: steps.cache.outputs.cache-hit != 'true'
      run: |
        python -m venv .env
        source .env/bin/activate
@@ -31,7 +40,10 @@ jobs:
    - name: Install dependencies
      run: |
        source .env/bin/activate
-        pip install .[sklearn,torch,testing]
+        pip install --upgrade pip
+        pip install torch!=1.6.0
+        pip install .[sklearn,testing,onnxruntime]
+        pip install git+https://github.com/huggingface/datasets

    - name: Are GPUs recognized by our DL frameworks
      run: |
@@ -46,5 +58,15 @@ jobs:
        USE_CUDA: yes
      run: |
        source .env/bin/activate
-        python -m pytest -n 1 --dist=loadfile -s -v ./tests/
-        
+        python -m pytest -n 1 --dist=loadfile -s ./tests/
+
+    - name: Run examples tests on GPU
+      env:
+        TF_FORCE_GPU_ALLOW_GROWTH: "true"
+        OMP_NUM_THREADS: 1
+        RUN_SLOW: yes
+        USE_CUDA: yes
+      run: |
+        source .env/bin/activate
+        pip install -r examples/requirements.txt
+        python -m pytest -n 1 --dist=loadfile -s examples
--- a/.gitignore
+++ b/.gitignore
@@ -8,6 +8,10 @@ __pycache__/
 # C extensions
 *.so

+# tests and logs
+tests/fixtures
+logs/
+
 # Distribution / packaging
 .Python
 build/
@@ -116,6 +120,7 @@ dmypy.json
 .pyre/

 # vscode
+.vs
 .vscode

 # Pycharm
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -65,7 +65,8 @@ Awesome! Please provide the following information:
 If you are willing to contribute the model yourself, let us know so we can best
 guide you.

-We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder.
+We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them
+in the [`templates`](https://github.com/huggingface/transformers/tree/master/templates) folder.

 ### Do you want a new feature (that is not a model)?

@@ -86,7 +87,9 @@ A world-class feature request addresses the following points:
 If your issue is well written we're already 80% of the way there by the time you
 post it.

-We have added **templates** to guide you in the process of adding a new example script for training or testing the models in the library. You can find them in the [`templates`](./templates) folder.
+We have added **templates** to guide you in the process of adding a new example script for training or testing the
+models in the library. You can find them in the [`templates`](https://github.com/huggingface/transformers/tree/master/templates)
+folder.

 ## Start contributing! (Pull Requests)

@@ -131,12 +134,18 @@ Follow these steps to start contributing:
   it with `pip uninstall transformers` before reinstalling it in editable
   mode with the `-e` flag.)

-   Right now, we need an unreleased version of `isort` to avoid a
-   [bug](https://github.com/timothycrosley/isort/pull/1000):
+   To run the full test suite, you might need the additional dependency on `datasets` which requires a separate source
+   install:

   ```bash
-   $ pip install -U git+git://github.com/timothycrosley/isort.git@e63ae06ec7d70b06df9e528357650281a3d3ec22#egg=isort
+   $ git clone https://github.com/huggingface/datasets
+   $ cd datasets
+   $ pip install -e .
   ```
+
+   If you have already cloned that repo, you might need to `git pull` to get the most recent changes in the `datasets`
+   library.
+
 5. Develop the features on your branch.

   As you work on the features, you should make sure that the test suite
@@ -146,6 +155,14 @@ Follow these steps to start contributing:
   $ make test
   ```

+   Note that this command uses the `-n auto` pytest flag, so it will start as many parallel `pytest` processes as your computer has CPU cores. If you have many cores, a few GPUs and not a great amount of RAM, this is likely to overload your computer. To run the test suite, you may therefore want to consider using this command instead:
+
+   ```bash
+   $ python -m pytest -n 3 --dist=loadfile -s -v ./tests/
+   ```
+
+   Adjust the value of `-n` to fit the load your hardware can support.
+
   `transformers` relies on `black` and `isort` to format its source code
   consistently. After you make changes, format them with:

@@ -160,6 +177,16 @@ Follow these steps to start contributing:
   $ make quality
   ```

+   If you're modifying documents under `docs/source`, make sure to validate that
+   they can still be built. This check also runs in CI. To run a local check
+   make sure you have installed the documentation builder requirements, by
+   running `pip install .[tf,torch,docs]` once from the root of this repository
+   and then run:
+
+   ```bash
+   $ make docs
+   ```
+
   Once you're happy with your changes, add changed files using `git add` and
   make a commit with `git commit` to record your changes locally:

@@ -205,16 +232,22 @@ Follow these steps to start contributing:
   are useful to avoid duplicated work, and to differentiate it from PRs ready
   to be merged;
 4. Make sure existing tests pass;
-5. Add high-coverage tests. No quality testing = no merge. 
- - If you are adding a new model, make sure that you use `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)`, which triggers the common tests.
- - If you are adding new `@slow` tests, make sure they pass using `RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`. 
- - If you are adding a new tokenizer, write tests, and make sure `RUN_SLOW=1 python -m pytest tests/test_tokenization_{your_model_name}.py` passes.
-CircleCI does not run them. 
-6. All public methods must have informative docstrings that work nicely with sphinx. See `modeling_ctrl.py` for an example.
+5. Add high-coverage tests. No quality testing = no merge.
+   - If you are adding a new model, make sure that you use
+     `ModelTester.all_model_classes = (MyModel, MyModelWithLMHead,...)`, which triggers the common tests.
+   - If you are adding new `@slow` tests, make sure they pass using
+     `RUN_SLOW=1 python -m pytest tests/test_my_new_model.py`.
+   - If you are adding a new tokenizer, write tests, and make sure
+     `RUN_SLOW=1 python -m pytest tests/test_tokenization_{your_model_name}.py` passes.
+   CircleCI does not run the slow tests, but GitHub Actions does every night!
+6. All public methods must have informative docstrings that work nicely with sphinx. See `modeling_ctrl.py` for an
+   example.

 ### Tests

-You can run 🤗 Transformers tests with `unittest` or `pytest`.
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in
+the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the
+[examples folder](https://github.com/huggingface/transformers/tree/master/examples).

 We like `pytest` and `pytest-xdist` because it's faster. From the root of the
 repository, here's how to run tests with `pytest` for the library:
@@ -229,8 +262,7 @@ and for the examples:
 $ pip install -r examples/requirements.txt  # only needed the first time
 $ python -m pytest -n auto --dist=loadfile -s -v ./examples/
 ```
-
-In fact, that's how `make test` and `make test-examples` are implemented!
+In fact, that's how `make test` and `make test-examples` are implemented (sans the `pip install` line)!

 You can specify a smaller set of tests in order to test only the feature
 you're working on.
@@ -261,7 +293,8 @@ $ python -m unittest discover -s examples -t examples -v

 ### Style guide

-For documentation strings, `transformers` follows the [google
-style](https://google.github.io/styleguide/pyguide.html).
+For documentation strings, `transformers` follows the [google style](https://google.github.io/styleguide/pyguide.html).
+Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/master/docs#writing-documentation---specification)
+for more information.

 #### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md)
--- a/18
+++ b/18
@@ -1,17 +1,24 @@
-.PHONY: quality style test test-examples
+.PHONY: quality style test test-examples docs

 # Check that source code meets quality standards

 quality:
 	black --check --line-length 119 --target-version py35 examples templates tests src utils
-	isort --check-only --recursive examples templates tests src utils
+	isort --check-only examples templates tests src utils
 	flake8 examples templates tests src utils
+	python utils/check_copies.py
+	python utils/check_repo.py

 # Format source code automatically

 style:
 	black --line-length 119 --target-version py35 examples templates tests src utils
-	isort --recursive examples templates tests src utils
+	isort examples templates tests src utils
+
+# Make marked copies of code snippets conform to the original
+
+fix-copies:
+	python utils/check_copies.py --fix_and_overwrite

 # Run tests for the library

@@ -22,3 +29,8 @@ test:

 test-examples:
 	python -m pytest -n auto --dist=loadfile -s -v ./examples/
+
+# Check that docs can build
+
+docs:
+	cd docs && make html SPHINXOPTS="-W"
--- a/README.md
+++ b/README.md
@@ -22,58 +22,125 @@
 <p>State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
 </h3>

-🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides state-of-the-art general-purpose architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet, T5, CTRL...) for Natural Language Understanding (NLU) and Natural Language Generation (NLG) with over thousands of pretrained models in 100+ languages and deep interoperability between PyTorch & TensorFlow 2.0.
+🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation, etc. in 100+ languages. Its aim is to make cutting-edge NLP easier to use for everyone.

+🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each python module defining an architecture can be used as a standalone and modified to enable quick research experiments. 
+
+🤗 Transformers is backed by the two most popular deep learning libraries, [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/), with a seamless integration between them, allowing you to train your models with one and then load them for inference with the other.
+
+### Recent contributors
 [![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/0)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/0)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/1)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/1)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/2)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/2)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/3)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/3)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/4)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/4)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/5)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/5)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/6)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/6)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/7)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/7)

-### Features
- High performance on NLU and NLG tasks
- Low barrier to entry for educators and practitioners
+## Online demos

-State-of-the-art NLP for everyone
- Deep learning researchers
- Hands-on practitioners
- AI/ML/NLP teachers and educators
+You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer an [inference API](https://huggingface.co/pricing) to use those models.

-Lower compute costs, smaller carbon footprint
- Researchers can share trained models instead of always retraining
- Practitioners can reduce compute time and production costs
- Dozens of architectures with over 1,000 pretrained models, some in more than 100 languages
+Here are a few examples: 
+- [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
+- [Named Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
+- [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
+- [Natural Language Inference with RoBERTa](https://huggingface.co/roberta-large-mnli?text=The+dog+was+lost.+Nobody+lost+any+animal)
+- [Summarization with BART](https://huggingface.co/facebook/bart-large-cnn?text=The+tower+is+324+metres+%281%2C063+ft%29+tall%2C+about+the+same+height+as+an+81-storey+building%2C+and+the+tallest+structure+in+Paris.+Its+base+is+square%2C+measuring+125+metres+%28410+ft%29+on+each+side.+During+its+construction%2C+the+Eiffel+Tower+surpassed+the+Washington+Monument+to+become+the+tallest+man-made+structure+in+the+world%2C+a+title+it+held+for+41+years+until+the+Chrysler+Building+in+New+York+City+was+finished+in+1930.+It+was+the+first+structure+to+reach+a+height+of+300+metres.+Due+to+the+addition+of+a+broadcasting+aerial+at+the+top+of+the+tower+in+1957%2C+it+is+now+taller+than+the+Chrysler+Building+by+5.2+metres+%2817+ft%29.+Excluding+transmitters%2C+the+Eiffel+Tower+is+the+second+tallest+free-standing+structure+in+France+after+the+Millau+Viaduct)
+- [Question answering with DistilBERT](https://huggingface.co/distilbert-base-uncased-distilled-squad?text=Which+name+is+also+used+to+describe+the+Amazon+rainforest+in+English%3F&context=The+Amazon+rainforest+%28Portuguese%3A+Floresta+Amaz%C3%B4nica+or+Amaz%C3%B4nia%3B+Spanish%3A+Selva+Amaz%C3%B3nica%2C+Amazon%C3%ADa+or+usually+Amazonia%3B+French%3A+For%C3%AAt+amazonienne%3B+Dutch%3A+Amazoneregenwoud%29%2C+also+known+in+English+as+Amazonia+or+the+Amazon+Jungle%2C+is+a+moist+broadleaf+forest+that+covers+most+of+the+Amazon+basin+of+South+America.+This+basin+encompasses+7%2C000%2C000+square+kilometres+%282%2C700%2C000+sq+mi%29%2C+of+which+5%2C500%2C000+square+kilometres+%282%2C100%2C000+sq+mi%29+are+covered+by+the+rainforest.+This+region+includes+territory+belonging+to+nine+nations.+The+majority+of+the+forest+is+contained+within+Brazil%2C+with+60%25+of+the+rainforest%2C+followed+by+Peru+with+13%25%2C+Colombia+with+10%25%2C+and+with+minor+amounts+in+Venezuela%2C+Ecuador%2C+Bolivia%2C+Guyana%2C+Suriname+and+French+Guiana.+States+or+departments+in+four+nations+contain+%22Amazonas%22+in+their+names.+The+Amazon+represents+over+half+of+the+planet%27s+remaining+rainforests%2C+and+comprises+the+largest+and+most+biodiverse+tract+of+tropical+rainforest+in+the+world%2C+with+an+estimated+390+billion+individual+trees+divided+into+16%2C000+species)
+- [Translation with T5](https://huggingface.co/t5-base?text=My+name+is+Wolfgang+and+I+live+in+Berlin)

-Choose the right framework for every part of a model's lifetime
- Train state-of-the-art models in 3 lines of code
- Deep interoperability between TensorFlow 2.0 and PyTorch models
- Move a single model between TF2.0/PyTorch frameworks at will
- Seamlessly pick the right framework for training, evaluation, production
+**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team, is the official demo of this repo’s text generation capabilities.

+## Quick tour

-| Section | Description |
-|-|-|
-| [Installation](#installation) | How to install the package |
-| [Model architectures](#model-architectures) | Architectures (with pretrained weights) |
-| [Online demo](#online-demo) | Experimenting with this repo’s text generation capabilities |
-| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
-| [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
-| [Quick tour: pipelines](#quick-tour-of-pipelines) | Using Pipelines: Wrapper around tokenizer and models to use finetuned models |
-| [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
-| [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community |
-| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
-| [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
-| [Documentation][(v2.5.0)](https://huggingface.co/transformers/v2.5.0)[(v2.4.0/v2.4.1)](https://huggingface.co/transformers/v2.4.0)[(v2.3.0)](https://huggingface.co/transformers/v2.3.0)[(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
+To immediately use a model on a given text, we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model's training. Here is how to quickly use a pipeline to classify positive versus negative texts:
+
+```python
+>>> from transformers import pipeline
+
+# Allocate a pipeline for sentiment-analysis
+>>> classifier = pipeline('sentiment-analysis')
+>>> classifier('We are very happy to include pipeline into the transformers repository.')
+[{'label': 'POSITIVE', 'score': 0.9978193640708923}]
+```
+
+The second line of code downloads and caches the pretrained model used by the pipeline, while the third line evaluates it on the given text. Here the answer is "positive" with a confidence of 99.8%.
+
+Here is another example of a pipeline, this one used to extract the answer to a question from some context:
+
+``` python
+>>> from transformers import pipeline
+
+# Allocate a pipeline for question-answering
+>>> question_answerer = pipeline('question-answering')
+>>> question_answerer({
+...     'question': 'What is the name of the repository ?',
+...     'context': 'Pipeline have been included in the huggingface/transformers repository'
+... })
+{'score': 0.5135612454720828, 'start': 35, 'end': 59, 'answer': 'huggingface/transformers'}
+
+```
+
+On top of the answer, the pretrained model used here returned its confidence score, along with the start position and end position of the answer in the tokenized sentence. You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/transformers/task_summary.html).
+
+To download and use any of the pretrained models on your given task, you just need to use the following three lines of code (PyTorch version):
+```python
+>>> from transformers import AutoTokenizer, AutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = AutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="pt")
+>>> outputs = model(**inputs)
+```
+or for TensorFlow:
+```python
+>>> from transformers import AutoTokenizer, TFAutoModel
+
+>>> tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
+>>> model = TFAutoModel.from_pretrained("bert-base-uncased")
+
+>>> inputs = tokenizer("Hello world!", return_tensors="tf")
+>>> outputs = model(**inputs)
+```
+
+The tokenizer is responsible for all the preprocessing the pretrained model expects, and can be called directly on a single text (or a list of texts), as we can see on the fourth line of both code examples. It will output a dictionary you can directly pass to your model (which is done on the fifth line).
+
+The model itself is a regular [PyTorch `nn.Module`](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) or a [TensorFlow `tf.keras.Model`](https://www.tensorflow.org/api_docs/python/tf/keras/Model) (depending on your backend) which you can use normally. For instance, [this tutorial](https://huggingface.co/transformers/training.html) explains how to integrate such a model in a classic PyTorch or TensorFlow training loop, or how to use our `Trainer` API to quickly fine-tune it on a new dataset.
+
+## Why should I use transformers?
+
+1. Easy-to-use state-of-the-art models:
+    - High performance on NLU and NLG tasks.
+    - Low barrier to entry for educators and practitioners.
+    - Few user-facing abstractions with just three classes to learn.
+    - A unified API for using all our pretrained models.
+
+1. Lower compute costs, smaller carbon footprint:
+    - Researchers can share trained models instead of always retraining.
+    - Practitioners can reduce compute time and production costs.
+    - Dozens of architectures with over 2,000 pretrained models, some in more than 100 languages.
+
+1. Choose the right framework for every part of a model's lifetime:
+    - Train state-of-the-art models in 3 lines of code.
+    - Move a single model between TF2.0/PyTorch frameworks at will.
+    - Seamlessly pick the right framework for training, evaluation, production.
+
+1. Easily customize a model or an example to your needs:
+    - Examples for each architecture to reproduce the results by the official authors of said architecture.
+    - Expose the models' internals as consistently as possible.
+    - Model files can be used independently of the library for quick experiments. 
+
+## Why shouldn't I use transformers?
+
+- This library is not a modular toolbox of building blocks for neural nets. The code in the model files is not refactored with additional abstractions on purpose, so that researchers can quickly iterate on each of the models without diving into additional abstractions/files.
+- The training API is not intended to work on any model but is optimized to work with the models provided by the library. For generic machine learning loops, you should use another library.
+- While we strive to present as many use cases as possible, the scripts in our [examples folder](https://github.com/huggingface/transformers/tree/master/examples) are just that: examples. It is expected that they won't work out-of-the-box on your specific problem and that you will be required to change a few lines of code to adapt them to your needs.

 ## Installation

-This repo is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for examples) and TensorFlow 2.0.
+This repository is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for [examples](https://github.com/huggingface/transformers/tree/master/examples)) and TensorFlow 2.0.

 You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

-Create a virtual environment with the version of Python you're going to use and activate it.
+First, create a virtual environment with the version of Python you're going to use and activate it.

-Now, if you want to use 🤗 Transformers, you can install it with pip. If you'd like to play with the examples, you must install it from source.
-
-### With pip
-
-First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
+Then, you will need to install one of, or both, TensorFlow 2.0 and PyTorch.
 Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.

 When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
@@ -82,68 +149,11 @@ When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be
 pip install transformers
 ```

-### From source
+If you'd like to play with the examples, you must [install the library from source](https://huggingface.co/transformers/installation.html#installing-from-source).

-Here also, you first need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
+## Model architectures

-When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and running:
-
-```bash
-git clone https://github.com/huggingface/transformers
-cd transformers
-pip install .
-```
-
-When you update the repository, you should upgrade the transformers installation and its dependencies as follows:
-
-```bash
-git pull
-pip install --upgrade .
-```
-
-### Run the examples
-
-Examples are included in the repository but are not shipped with the library.
-
-Therefore, in order to run the latest versions of the examples, you need to install from source, as described above.
-
-Look at the [README](https://github.com/huggingface/transformers/blob/master/examples/README.md) for how to run examples.
-
-### Tests
-
-A series of tests are included for the library and for some example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
-
-Depending on which framework is installed (TensorFlow 2.0 and/or PyTorch), the irrelevant tests will be skipped. Ensure that both frameworks are installed if you want to execute all tests.
-
-Here's the easiest way to run tests for the library:
-
-```bash
-pip install -e ".[testing]"
-make test
-```
-
-and for the examples:
-
-```bash
-pip install -e ".[testing]"
-pip install -r examples/requirements.txt
-make test-examples
-```
-
-For details, refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests).
-
-### Do you want to run a Transformer model on a mobile device?
-
-You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
-
-It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
-
-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models to productizing them in CoreML, or prototype a model or an app in CoreML then research its hyperparameters or architecture from TensorFlow 2.0 and/or PyTorch. Super exciting!
-
-## Model architectures
-
-🤗 Transformers currently provides the following NLU/NLG architectures:
+🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/transformers/model_summary.html) for a high-level summary of each of them):

 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 2. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
@@ -166,529 +176,39 @@ At some point in the future, you'll be able to seamlessly move from pre-training
 19. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
 20. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
 21. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-22. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
-23. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
-
-These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
-
-## Online demo
-
-**[Write With Transformer](https://transformer.huggingface.co)**, built by the Hugging Face team at transformer.huggingface.co, is the official demo of this repo’s text generation capabilities.
-You can use it to experiment with completions generated by `GPT2Model`, `TransfoXLModel`, and `XLNetModel`.
-
-> “🦄 Write with transformer is to writing what calculators are to calculus.”
-
-![write_with_transformer](https://transformer.huggingface.co/front/assets/thumbnail-large.png)
-
-## Quick tour
-
-Let's do a very quick overview of the model architectures in 🤗 Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/transformers/).
-
-```python
-import torch
-from transformers import *
-
-# Transformers has a unified API
-# for 10 transformer architectures and 30 pretrained weights.
-#          Model          | Tokenizer          | Pretrained weights shortcut
-MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
-          (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
-          (GPT2Model,       GPT2Tokenizer,       'gpt2'),
-          (CTRLModel,       CTRLTokenizer,       'ctrl'),
-          (TransfoXLModel,  TransfoXLTokenizer,  'transfo-xl-wt103'),
-          (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
-          (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
-          (DistilBertModel, DistilBertTokenizer, 'distilbert-base-cased'),
-          (RobertaModel,    RobertaTokenizer,    'roberta-base'),
-          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
-         ]
-
-# To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`
-
-# Let's encode some text in a sequence of hidden-states using each model:
-for model_class, tokenizer_class, pretrained_weights in MODELS:
-    # Load pretrained model/tokenizer
-    tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
-    model = model_class.from_pretrained(pretrained_weights)
-
-    # Encode text
-    input_ids = torch.tensor([tokenizer.encode("Here is some text to encode", add_special_tokens=True)])  # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
-    with torch.no_grad():
-        last_hidden_states = model(input_ids)[0]  # Models outputs are now tuples
-
-# Each architecture is provided with several class for fine-tuning on down-stream tasks, e.g.
-BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNextSentencePrediction,
-                      BertForSequenceClassification, BertForTokenClassification, BertForQuestionAnswering]
-
-# All the classes for an architecture can be initiated from pretrained weights for this architecture
-# Note that additional weights added for fine-tuning are only initialized
-# and need to be trained on the down-stream task
-pretrained_weights = 'bert-base-uncased'
-tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
-for model_class in BERT_MODEL_CLASSES:
-    # Load pretrained model/tokenizer
-    model = model_class.from_pretrained(pretrained_weights)
-
-    # Models can return full list of hidden-states & attentions weights at each layer
-    model = model_class.from_pretrained(pretrained_weights,
-                                        output_hidden_states=True,
-                                        output_attentions=True)
-    input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
-    all_hidden_states, all_attentions = model(input_ids)[-2:]
-
-    # Models are compatible with Torchscript
-    model = model_class.from_pretrained(pretrained_weights, torchscript=True)
-    traced_model = torch.jit.trace(model, (input_ids,))
-
-    # Simple serialization for models and tokenizers
-    model.save_pretrained('./directory/to/save/')  # save
-    model = model_class.from_pretrained('./directory/to/save/')  # re-load
-    tokenizer.save_pretrained('./directory/to/save/')  # save
-    tokenizer = BertTokenizer.from_pretrained('./directory/to/save/')  # re-load
-
-    # SOTA examples for GLUE, SQUAD, text generation...
-```
-
-## Quick tour TF 2.0 training and PyTorch interoperability
-
-Let's do a quick example of how a TensorFlow 2.0 model can be trained in 12 lines of code with 🤗 Transformers and then loaded in PyTorch for fast inspection/tests.
-
-```python
-import tensorflow as tf
-import tensorflow_datasets
-from transformers import *
-
-# Load dataset, tokenizer, model from pretrained model/vocabulary
-tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
-model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
-data = tensorflow_datasets.load('glue/mrpc')
-
-# Prepare dataset for GLUE as a tf.data.Dataset instance
-train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
-valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, max_length=128, task='mrpc')
-train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
-valid_dataset = valid_dataset.batch(64)
-
-# Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule
-optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
-loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
-metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
-model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
-
-# Train and evaluate using tf.keras.Model.fit()
-history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
-                    validation_data=valid_dataset, validation_steps=7)
-
-# Load the TensorFlow model in PyTorch for inspection
-model.save_pretrained('./save/')
-pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
-
-# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
-sentence_0 = "This research was consistent with his findings."
-sentence_1 = "His findings were compatible with this research."
-sentence_2 = "His findings were not compatible with this research."
-inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
-inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
-
-pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item()
-pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item()
-
-print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
-print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
-```
-
-## Quick tour of the fine-tuning/usage scripts
-
-**Important**
-Before running the fine-tuning scripts, please read the
-[instructions](#run-the-examples) on how to
-setup your environment to run the examples.
-
-The library comprises several example scripts with SOTA performances for NLU and NLG tasks:
-
- `run_glue.py`: an example fine-tuning sequence classification models on nine different GLUE tasks (*sequence-level classification*)
- `run_squad.py`: an example fine-tuning question answering models on the question answering dataset SQuAD 2.0 (*token-level classification*)
- `run_ner.py`: an example fine-tuning token classification models on named entity recognition (*token-level classification*)
- `run_generation.py`: an example using GPT, GPT-2, CTRL, Transformer-XL and XLNet for conditional language generation
- other model-specific examples (see the documentation).
-
-Here are three quick usage examples for these scripts:
-
-### `run_glue.py`: Fine-tuning on GLUE tasks for sequence classification
-
-The [General Language Understanding Evaluation (GLUE) benchmark](https://gluebenchmark.com/) is a collection of nine sentence- or sentence-pair language understanding tasks for evaluating and analyzing natural language understanding systems.
-
-Before running any of these GLUE tasks you should download the
-[GLUE data](https://gluebenchmark.com/tasks) by running
-[this script](https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
-and unpack it to some directory `$GLUE_DIR`.
-
-You should also install the additional packages required by the examples:
-
-```shell
-pip install -r ./examples/requirements.txt
-```
-
-```shell
-export GLUE_DIR=/path/to/glue
-export TASK_NAME=MRPC
-
-python ./examples/text-classification/run_glue.py \
-    --model_name_or_path bert-base-uncased \
-    --task_name $TASK_NAME \
-    --do_train \
-    --do_eval \
-    --data_dir $GLUE_DIR/$TASK_NAME \
-    --max_seq_length 128 \
-    --per_device_eval_batch_size=8   \
-    --per_device_train_batch_size=8   \
-    --learning_rate 2e-5 \
-    --num_train_epochs 3.0 \
-    --output_dir /tmp/$TASK_NAME/
-```
-
-where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
-
-The dev set results will be present within the text file 'eval_results.txt' in the specified output_dir. In case of MNLI, since there are two separate dev sets, matched and mismatched, there will be a separate output folder called '/tmp/MNLI-MM/' in addition to '/tmp/MNLI/'.
-
-#### Fine-tuning XLNet model on the STS-B regression task
-
-This example code fine-tunes XLNet on the STS-B corpus using parallel training on a server with 4 V100 GPUs.
-Parallel training is a simple way to use several GPUs (but is slower and less flexible than distributed training, see below).
-
-```shell
-export GLUE_DIR=/path/to/glue
-
-python ./examples/text-classification/run_glue.py \
-    --model_name_or_path xlnet-large-cased \
-    --do_train  \
-    --do_eval   \
-    --task_name=sts-b     \
-    --data_dir=${GLUE_DIR}/STS-B  \
-    --output_dir=./proc_data/sts-b-110   \
-    --max_seq_length=128   \
-    --per_device_eval_batch_size=8   \
-    --per_device_train_batch_size=8   \
-    --gradient_accumulation_steps=1 \
-    --max_steps=1200  \
-    --model_name=xlnet-large-cased   \
-    --overwrite_output_dir   \
-    --overwrite_cache \
-    --warmup_steps=120
-```
-
-On this machine we thus have a batch size of 32, please increase `gradient_accumulation_steps` to reach the same batch size if you have a smaller machine. These hyper-parameters should result in a Pearson correlation coefficient of `+0.917` on the development set.
-
-#### Fine-tuning Bert model on the MRPC classification task
-
-This example code fine-tunes the Bert Whole Word Masking model on the Microsoft Research Paraphrase Corpus (MRPC) corpus using distributed training on 8 V100 GPUs to reach a F1 > 92.
-
-```bash
-python -m torch.distributed.launch --nproc_per_node 8 ./examples/text-classification/run_glue.py   \
-    --model_name_or_path bert-large-uncased-whole-word-masking \
-    --task_name MRPC \
-    --do_train   \
-    --do_eval   \
-    --data_dir $GLUE_DIR/MRPC/   \
-    --max_seq_length 128   \
-    --per_device_eval_batch_size=8   \
-    --per_device_train_batch_size=8   \
-    --learning_rate 2e-5   \
-    --num_train_epochs 3.0  \
-    --output_dir /tmp/mrpc_output/ \
-    --overwrite_output_dir   \
-    --overwrite_cache \
-```
-
-Training with these hyper-parameters gave us the following results:
-
-```bash
-  acc = 0.8823529411764706
-  acc_and_f1 = 0.901702786377709
-  eval_loss = 0.3418912578906332
-  f1 = 0.9210526315789473
-  global_step = 174
-  loss = 0.07231863956341798
-```
-
-### `run_squad.py`: Fine-tuning on SQuAD for question-answering
-
-This example code fine-tunes BERT on the SQuAD dataset using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach a F1 > 93 on SQuAD:
-
-```bash
-python -m torch.distributed.launch --nproc_per_node=8 ./examples/question-answering/run_squad.py \
-    --model_type bert \
-    --model_name_or_path bert-large-uncased-whole-word-masking \
-    --do_train \
-    --do_eval \
-    --train_file $SQUAD_DIR/train-v1.1.json \
-    --predict_file $SQUAD_DIR/dev-v1.1.json \
-    --learning_rate 3e-5 \
-    --num_train_epochs 2 \
-    --max_seq_length 384 \
-    --doc_stride 128 \
-    --output_dir ../models/wwm_uncased_finetuned_squad/ \
-    --per_device_eval_batch_size=3   \
-    --per_device_train_batch_size=3   \
-```
-
-Training with these hyper-parameters gave us the following results:
-
-```bash
-python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
-{"exact_match": 86.91579943235573, "f1": 93.1532499015869}
-```
-
-This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-squad`.
-
-### `run_generation.py`: Text generation with GPT, GPT-2, CTRL, Transformer-XL and XLNet
-
-A conditional generation script is also included to generate text from a prompt.
-The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high-quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
-
-Here is how to run the script with the small version of OpenAI GPT-2 model:
-
-```shell
-python ./examples/text-generation/run_generation.py \
-    --model_type=gpt2 \
-    --length=20 \
-    --model_name_or_path=gpt2 \
-```
-
-and from the Salesforce CTRL model:
-```shell
-python ./examples/text-generation/run_generation.py \
-    --model_type=ctrl \
-    --length=20 \
-    --model_name_or_path=ctrl \
-    --temperature=0 \
-    --repetition_penalty=1.2 \
-```
-
-## Quick tour of model sharing
-
-Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
-
-**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Optionally, join an existing organization or create a new one. Then:
-
-```shell
-transformers-cli login
-# log in using the same credentials as on huggingface.co
-```
-Upload your model:
-```shell
-transformers-cli upload ./path/to/pretrained_model/
-
-# ^^ Upload folder containing weights/tokenizer/config
-# saved via `.save_pretrained()`
-
-transformers-cli upload ./config.json [--filename folder/foobar.json]
-
-# ^^ Upload a single file
-# (you can optionally override its filename, which can be nested inside a folder)
-```
-
-If you want your model to be namespaced by your organization name rather than your username, add the following flag to any command:
-```shell
--organization organization_name
-```
-
-Your model will then be accessible through its identifier, a concatenation of your username (or organization name) and the folder name above:
-```python
-"username/pretrained_model"
-# or if an org:
-"organization_name/pretrained_model"
-```
-
-**Please add a README.md model card** to the repo under `model_cards/` with: model description, training params (dataset, preprocessing, hardware used, hyperparameters), evaluation results, intended uses & limitations, etc.
-
-Your model now has a page on huggingface.co/models 🔥
-
-Anyone can load it from code:
-```python
-tokenizer = AutoTokenizer.from_pretrained("namespace/pretrained_model")
-model = AutoModel.from_pretrained("namespace/pretrained_model")
-```
-
-List all your files on S3:
-```shell
-transformers-cli s3 ls
-```
-
-You can also delete unneeded files:
-
-```shell
-transformers-cli s3 rm …
-```
-
-## Quick tour of pipelines
-
-New in version `v2.3`: `Pipeline` are high-level objects which automatically handle tokenization, running your data through a transformers model
-and outputting the result in a structured object.
-
-You can create `Pipeline` objects for the following down-stream tasks:
-
- - `feature-extraction`: Generates a tensor representation for the input sequence
- - `ner`: Generates named entity mapping for each word in the input sequence.
- - `sentiment-analysis`: Gives the polarity (positive / negative) of the whole input sequence.
- - `text-classification`: Initialize a `TextClassificationPipeline` directly, or see `sentiment-analysis` for an example.
- - `question-answering`: Provided some context and a question refering to the context, it will extract the answer to the question in the context.
- - `fill-mask`: Takes an input sequence containing a masked token (e.g. `<mask>`) and return list of most probable filled sequences, with their probabilities.
- - `summarization`
- - `translation_xx_to_yy`
-
-```python
-from transformers import pipeline
-
-# Allocate a pipeline for sentiment-analysis
-nlp = pipeline('sentiment-analysis')
-nlp('We are very happy to include pipeline into the transformers repository.')
->>> {'label': 'POSITIVE', 'score': 0.99893874}
-
-# Allocate a pipeline for question-answering
-nlp = pipeline('question-answering')
-nlp({
-    'question': 'What is the name of the repository ?',
-    'context': 'Pipeline have been included in the huggingface/transformers repository'
-})
->>> {'score': 0.28756016668193496, 'start': 35, 'end': 59, 'answer': 'huggingface/transformers'}
-```
-
-## Migrating from pytorch-transformers to transformers
-
-Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
-
-### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed
-
-To be able to use Torchscript (see #1010, #1204 and #1195) the specific order of some models **keywords inputs** (`attention_mask`, `token_type_ids`...) has been changed.
-
-If you used to call the models with keyword names for keyword arguments, e.g. `model(inputs_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)`, this should not cause any change.
-
-If you used to call the models with positional inputs for keyword arguments, e.g. `model(inputs_ids, attention_mask, token_type_ids)`, you may have to double check the exact order of input arguments.
-
-
-## Migrating from pytorch-pretrained-bert to transformers
-
-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`.
-
-### Models always output `tuples`
-
-The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that every model's forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
-
-The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
-
-In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
-
-Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
-
-```python
-# Let's load our model
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-# If you used to have this line in pytorch-pretrained-bert:
-loss = model(input_ids, labels=labels)
-
-# Now just use this line in transformers to extract the loss from the output tuple:
-outputs = model(input_ids, labels=labels)
-loss = outputs[0]
-
-# In transformers you can also have access to the logits:
-loss, logits = outputs[:2]
-
-# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
-outputs = model(input_ids, labels=labels)
-loss, logits, attentions = outputs
-```
-
-### Using hidden states
-
-By enabling the configuration option `output_hidden_states`, it was possible to retrieve the last hidden states of the encoder. In `pytorch-transformers` as well as `transformers` the return value has changed slightly: `all_hidden_states` now also includes the hidden state of the embeddings in addition to those of the encoding layers. This allows users to easily access the embeddings final state.
-
-### Serialization
-
-Breaking change in the `from_pretrained()` method:
-
-1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them, don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
-
-2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead, which can break derived model classes built based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding the the model's `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
-
-Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
-
-Here is an example:
-
-```python
-### Let's load a model and tokenizer
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-### Do some stuff to our model and tokenizer
-# Ex: add new tokens to the vocabulary and embeddings of our model
-tokenizer.add_tokens(['[SPECIAL_TOKEN_1]', '[SPECIAL_TOKEN_2]'])
-model.resize_token_embeddings(len(tokenizer))
-# Train our model
-train(model)
-
-### Now let's save our model and tokenizer to a directory
-model.save_pretrained('./my_saved_model_directory/')
-tokenizer.save_pretrained('./my_saved_model_directory/')
-
-### Reload the model and the tokenizer
-model = BertForSequenceClassification.from_pretrained('./my_saved_model_directory/')
-tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')
-```
-
-### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
-
-The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer which has a few differences:
-
- it only implements weights decay correction,
- schedules are now externals (see below),
- gradient clipping is now also external (see below).
-
-The new optimizer `AdamW` matches PyTorch `Adam` optimizer API and let you use standard PyTorch or apex methods for the schedule and clipping.
-
-The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore.
-
-Here is a conversion examples from `BertAdam` with a linear warmup and decay schedule to `AdamW` and the same schedule:
-
-```python
-# Parameters:
-lr = 1e-3
-max_grad_norm = 1.0
-num_training_steps = 1000
-num_warmup_steps = 100
-warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1
-
-### Previously BertAdam optimizer was instantiated like this:
-optimizer = BertAdam(model.parameters(), lr=lr, schedule='warmup_linear', warmup=warmup_proportion, t_total=num_training_steps)
-### and used like this:
-for batch in train_data:
-    loss = model(batch)
-    loss.backward()
-    optimizer.step()
-
-### In Transformers, optimizer and schedules are splitted and instantiated like this:
-optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
-scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
-### and used like this:
-for batch in train_data:
-    model.train()
-    loss = model(batch)
-    loss.backward()
-    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
-    optimizer.step()
-    scheduler.step()
-    optimizer.zero_grad()
-```
+22. **[DPR](https://github.com/facebookresearch/DPR)** (from Facebook) released with the paper [Dense Passage Retrieval
+for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
+Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+23. **[Pegasus](https://github.com/google-research/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+24. **[MBart](https://github.com/pytorch/fairseq/tree/master/examples/mbart)** (from Facebook) released with the paper  [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.  
+25. **[LXMERT](https://github.com/airsplay/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
+26. **[Funnel Transformer](https://github.com/laiguokun/Funnel-Transformer)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+27. **[LayoutLM](https://github.com/microsoft/unilm/tree/master/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+28. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
+29. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
+
+These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations. You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
+
+
+## Learn more
+
+| Section | Description |
+|-|-|
+| [Documentation](https://huggingface.co/transformers/) | Full API documentation and tutorials |
+| [Task summary](https://huggingface.co/transformers/task_summary.html) | Tasks supported by 🤗 Transformers |
+| [Preprocessing tutorial](https://huggingface.co/transformers/preprocessing.html) | Using the `Tokenizer` class to prepare data for the models |
+| [Training and fine-tuning](https://huggingface.co/transformers/training.html) | Using the models provided by 🤗 Transformers in a PyTorch/TensorFlow training loop and the `Trainer` API |
+| [Quick tour: Fine-tuning/usage scripts](https://github.com/huggingface/transformers/tree/master/examples) | Example scripts for fine-tuning models on a wide range of tasks |
+| [Model sharing and uploading](https://huggingface.co/transformers/model_sharing.html) | Upload and share your fine-tuned models with the community |
+| [Migration](https://huggingface.co/transformers/migration.html) | Migrate to 🤗 Transformers from `pytorch-transformers` or `pytorch-pretrained-bert` |

 ## Citation

-We now have a paper you can cite for the 🤗 Transformers library:
+We now have a [paper](https://arxiv.org/abs/1910.03771) you can cite for the 🤗 Transformers library:
 ```bibtex
@article{Wolf2019HuggingFacesTS,
  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
-  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R'emi Louf and Morgan Funtowicz and Jamie Brew},
+  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush},
  journal={ArXiv},
  year={2019},
  volume={abs/1910.03771}
--- a/codecov.yml
+++ b/codecov.yml
@@ -0,0 +1,10 @@
+coverage:
+  status:
+    project:
+      default:
+        informational: true    # informational mode for the project status (see Codecov docs)
+    patch: false               # disable the patch status (canonical boolean instead of `off`)
+comment:
+  require_changes: true    # only comment if there was change in coverage
+  require_head: true       # don't report if there is no head coverage report
+  require_base: true       # don't report if there is no base coverage report
--- a/deploy_multi_version_doc.sh
+++ b/deploy_multi_version_doc.sh
@@ -1,23 +0,0 @@
-cd docs
-
-function deploy_doc(){
-	echo "Creating doc at commit $1 and pushing to folder $2"
-	git checkout $1
-	if [ ! -z "$2" ] 
-	then
-		echo "Pushing version" $2
-		make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
-	else
-		echo "Pushing master"
-		make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
-	fi
-}
-
-deploy_doc "master" 
-deploy_doc "b33a385" v1.0.0
-deploy_doc "fe02e45" v1.1.0
-deploy_doc "89fd345" v1.2.0
-deploy_doc "fc9faa8" v2.0.0
-deploy_doc "3ddce1d" v2.1.1
-deploy_doc "f2f3294" v2.2.0
-deploy_doc "d0f8b9a" v2.3.0
--- a/docker/transformers-pytorch-tpu/Dockerfile
+++ b/docker/transformers-pytorch-tpu/Dockerfile
@@ -0,0 +1,65 @@
+FROM google/cloud-sdk:slim
+
+# Build args.
+ARG GITHUB_REF=refs/heads/master
+
+# TODO: This Dockerfile installs pytorch/xla 3.6 wheels. There are also 3.7
+# wheels available; see below.
+ENV PYTHON_VERSION=3.6
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+         build-essential \
+         cmake \
+         git \
+         curl \
+         ca-certificates
+
+# Install conda and python.
+# NOTE new Conda does not forward the exit status... https://github.com/conda/conda/issues/8385
+RUN curl -o ~/miniconda.sh https://repo.anaconda.com/miniconda/Miniconda3-4.7.12-Linux-x86_64.sh  && \
+    chmod +x ~/miniconda.sh && \
+    ~/miniconda.sh -b && \
+    rm ~/miniconda.sh
+
+ENV PATH=/root/miniconda3/bin:$PATH
+
+RUN conda create -y --name container python=$PYTHON_VERSION
+
+# Run the rest of commands within the new conda env.
+# Use absolute path to appease Codefactor.
+SHELL ["/root/miniconda3/bin/conda", "run", "-n", "container", "/bin/bash", "-c"]
+RUN conda install -y python=$PYTHON_VERSION mkl
+
+RUN pip uninstall -y torch && \
+    # Python 3.7 wheels are available. Replace cp36-cp36m with cp37-cp37m
+    gsutil cp 'gs://tpu-pytorch/wheels/torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
+    gsutil cp 'gs://tpu-pytorch/wheels/torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
+    gsutil cp 'gs://tpu-pytorch/wheels/torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' . && \
+    pip install 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    pip install 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    pip install 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    rm 'torch-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    rm 'torch_xla-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    rm 'torchvision-nightly-cp${PYTHON_VERSION/./}-cp${PYTHON_VERSION/./}m-linux_x86_64.whl' && \
+    apt-get install -y libomp5
+
+# Must be absolute: a relative entry would be resolved against the current working directory.
+ENV LD_LIBRARY_PATH=/root/miniconda3/envs/container/lib
+
+# Install huggingface/transformers at the current PR, plus dependencies.
+RUN git clone https://github.com/huggingface/transformers.git && \
+    cd transformers && \
+    git fetch origin $GITHUB_REF:CI && \
+    git checkout CI && \
+    cd .. && \
+    pip install ./transformers && \
+    pip install -r ./transformers/examples/requirements.txt && \
+    pip install pytest
+
+RUN python -c "import torch_xla; print(torch_xla.__version__)"
+RUN python -c "import transformers as trf; print(trf.__version__)"
+RUN conda init bash
+COPY docker-entrypoint.sh /usr/local/bin/
+RUN chmod +x /usr/local/bin/docker-entrypoint.sh
+ENTRYPOINT ["/usr/local/bin/docker-entrypoint.sh"]
+CMD ["bash"]
--- a/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet
+++ b/docker/transformers-pytorch-tpu/bert-base-cased.jsonnet
@@ -0,0 +1,38 @@
+local base = import 'templates/base.libsonnet';
+local tpus = import 'templates/tpus.libsonnet';
+local utils = import "templates/utils.libsonnet";
+local volumes = import "templates/volumes.libsonnet";
+
+local bertBaseCased = base.BaseTest {
+  frameworkPrefix: "hf",
+  modelName: "bert-base-cased",
+  mode: "example",
+  configMaps: [],
+
+  timeout: 3600, # 1 hour, in seconds
+
+  image: std.extVar('image'),
+  imageTag: std.extVar('image-tag'),
+
+  tpuSettings+: {
+    softwareVersion: "pytorch-nightly",
+  },
+  accelerator: tpus.v3_8,
+
+  volumeMap+: {
+    datasets: volumes.PersistentVolumeSpec {
+      name: "huggingface-cluster-disk",
+      mountPath: "/datasets",
+    },
+  },
+  command: utils.scriptCommand(
+    |||
+      python -m pytest -s transformers/examples/test_xla_examples.py -v
+      test_exit_code=$?
+      echo "\nFinished running commands.\n"
+      test $test_exit_code -eq 0
+    |||
+  ),
+};
+
+bertBaseCased.oneshotJob
--- a/docker/transformers-pytorch-tpu/dataset.yaml
+++ b/docker/transformers-pytorch-tpu/dataset.yaml
@@ -0,0 +1,32 @@
+apiVersion: v1
+kind: PersistentVolume
+metadata:
+  name: huggingface-cluster-disk
+spec:
+  storageClassName: ""
+  capacity:
+    storage: 500Gi
+  accessModes:
+    - ReadOnlyMany
+  claimRef:
+    namespace: default
+    name: huggingface-cluster-disk-claim
+  gcePersistentDisk:
+    pdName: huggingface-cluster-disk
+    fsType: ext4
+    readOnly: true
+---
+apiVersion: v1
+kind: PersistentVolumeClaim
+metadata:
+  name: huggingface-cluster-disk-claim
+spec:
+  # Specify "" as the storageClassName so it matches the PersistentVolume's StorageClass.
+  # A nil storageClassName value uses the default StorageClass. For details, see
+  # https://kubernetes.io/docs/concepts/storage/persistent-volumes/#class-1
+  storageClassName: ""
+  accessModes:
+    - ReadOnlyMany
+  resources:
+    requests:
+      storage: 1Ki
--- a/docker/transformers-pytorch-tpu/docker-entrypoint.sh
+++ b/docker/transformers-pytorch-tpu/docker-entrypoint.sh
@@ -0,0 +1,8 @@
+#!/bin/bash
+source ~/.bashrc  # presumably loads the shell hook written by `conda init bash` in the Dockerfile
+echo "running docker-entrypoint.sh"
+conda activate container
+echo "$KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS"
+echo "printed TPU info"
+export XRT_TPU_CONFIG="tpu_worker;0;${KUBE_GOOGLE_CLOUD_TPU_ENDPOINTS:7}"  # drops the first 7 chars (assumed to be a "grpc://" prefix - confirm)
+exec "$@"
--- a/docs/README.md
+++ b/docs/README.md
@@ -7,6 +7,14 @@ you can install them with the following command, at the root of the code reposit
 pip install -e ".[docs]"
 ```

+---
+**NOTE**
+
+You only need to generate the documentation to inspect it locally (if you're planning changes and want to 
+check how they look before committing for instance). You don't have to commit the built documentation.
+
+---
+
 ## Packages installed

 Here's an overview of all the packages installed. If you ran the previous command installing all packages from
@@ -34,20 +42,14 @@ pip install recommonmark

 ## Building the documentation

-Make sure that there is a symlink from the `example` file (in /examples) inside the source folder. Run the following
-command to generate it:
-
-```bash
-ln -s ../../examples/README.md examples.md
-```
-
 Once you have setup `sphinx`, you can build the documentation by running the following command in the `/docs` folder:

 ```bash
 make html
 ```

-A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your browser. 
+A folder called ``_build/html`` should have been created. You can now open the file ``_build/html/index.html`` in your
+browser. 

 ---
 **NOTE**
@@ -68,6 +70,18 @@ It should build the static app that will be available under `/docs/_build/html`
 Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
 in the source directory. You can then link it to the toc-tree by putting the filename without the extension.

+## Preview the documentation in a pull request
+
+Once you have made your pull request, you can check what the documentation will look like after it's merged by
+following these steps:
+
+- Look at the checks at the bottom of the conversation page of your PR (you may need to click on "show all checks" to
+  expand them).
+- Click on "details" next to the `ci/circleci: build_doc` check.
+- In the new window, click on the "Artifacts" tab.
+- Locate the file "docs/_build/html/index.html" (or any specific page you want to check) and click on it to get a 
+  preview.
+
 ## Writing Documentation - Specification

 The `huggingface/transformers` documentation follows the
@@ -112,8 +126,8 @@ XXXConfig
    :members:
 ```

-This will include every public method of the configuration. If for some reason you wish for a method not to be displayed
-in the documentation, you can do so by specifying which methods should be in the docs:
+This will include every public method of the configuration. If for some reason you wish for a method not to be
+displayed in the documentation, you can do so by specifying which methods should be in the docs:

 ```
 XXXTokenizer
@@ -127,8 +141,8 @@ XXXTokenizer

 ### Writing source documentation

-Values that should be put in `code` should either be surrounded by double backticks: \`\`like so\`\` or be written as an object
-using the :obj: syntax: :obj:\`like so\`.
+Values that should be put in `code` should either be surrounded by double backticks: \`\`like so\`\` or be written as
+an object using the :obj: syntax: :obj:\`like so\`.

 When mentionning a class, it is recommended to use the :class: syntax as the mentioned class will be automatically
 linked by Sphinx: :class:\`transformers.XXXClass\`
@@ -153,7 +167,7 @@ Here's an example showcasing everything so far:

            Indices can be obtained using :class:`transformers.AlbertTokenizer`.
            See :func:`transformers.PreTrainedTokenizer.encode` and
-            :func:`transformers.PreTrainedTokenizer.encode_plus` for details.
+            :func:`transformers.PreTrainedTokenizer.__call__` for details.

            `What are input IDs? <../glossary.html#input-ids>`__
 ```
--- a/docs/source/_static/css/code-snippets.css
+++ b/docs/source/_static/css/code-snippets.css
@@ -9,4 +9,8 @@

 .highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
    color: #6670FF;
+}
+
+.highlight .gp {
+    color: #FB8D68;
 }
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -1,9 +1,81 @@
 /* Our DOM objects */

+/* Colab dropdown */
+
+.colab-dropdown {
+    position: relative;
+    display: inline-block;
+}
+  
+.colab-dropdown-content {
+    display: none;
+    position: absolute;
+    background-color: #f9f9f9;
+    min-width: 117px;
+    box-shadow: 0px 8px 16px 0px rgba(0,0,0,0.2);
+    z-index: 1;
+}
+  
+.colab-dropdown-content button {
+    color: #6670FF;
+    background-color: #f9f9f9;
+    font-size: 12px;
+    border: none;
+    min-width: 117px;
+    padding: 5px 5px;
+    text-decoration: none;
+    display: block;
+}
+  
+.colab-dropdown-content button:hover {background-color: #eee;}
+  
+.colab-dropdown:hover .colab-dropdown-content {display: block;}
+
+/* Version control */
+
+.version-button {
+    background-color: #6670FF;
+    color: white;
+    border: none;
+    padding: 5px;
+    font-size: 15px;
+    cursor: pointer;
+}
+
+.version-button:hover, .version-button:focus {
+    background-color: #A6B0FF;
+}
+ 
+.version-dropdown {
+    display: none;
+    background-color: #6670FF;
+    min-width: 160px;
+    overflow: auto;
+    font-size: 15px;
+}
+  
+.version-dropdown a {
+    color: white;
+    padding: 3px 4px;
+    text-decoration: none;
+    display: block;
+}
+  
+.version-dropdown a:hover {
+    background-color: #A6B0FF;
+}
+  
+.version-show {
+    display: block;
+}
+
+/* Framework selector */
+
 .framework-selector {
    display: flex;
    flex-direction: row;
    justify-content: flex-end;
+    margin-right: 30px;
 }

 .framework-selector > button {
@@ -20,6 +92,12 @@
    padding: 5px;
 }

+/* Copy button */
+
+a.copybtn {
+    margin: 3px;
+}
+
 /* The literal code blocks */
 .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
    color: #6670FF;
@@ -38,6 +116,7 @@

 /* The research field on top of the toc tree */
 .wy-side-nav-search{
+    padding-top: 0;
    background-color: #6670FF;
 }

--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,3 +1,40 @@
+// These two things need to be updated at each release for the version selector.
+// Last stable version
+const stableVersion = "v3.1.0"
+// Dictionary doc folder to label (the "" key carries the "(stable)" label; only one entry should)
+const versionMapping = {
+    "master": "master",
+    "": "v3.1.0 (stable)",
+    "v3.0.2": "v3.0.0/v3.0.1/v3.0.2",
+    "v2.11.0": "v2.11.0",
+    "v2.10.0": "v2.10.0",
+    "v2.9.1": "v2.9.0/v2.9.1",
+    "v2.8.0": "v2.8.0",
+    "v2.7.0": "v2.7.0",
+    "v2.6.0": "v2.6.0",
+    "v2.5.1": "v2.5.0/v2.5.1",
+    "v2.4.0": "v2.4.0/v2.4.1",
+    "v2.3.0": "v2.3.0",
+    "v2.2.0": "v2.2.0/v2.2.1/v2.2.2",
+    "v2.1.1": "v2.1.1",
+    "v2.0.0": "v2.0.0",
+    "v1.2.0": "v1.2.0",
+    "v1.1.0": "v1.1.0",
+    "v1.0.0": "v1.0.0"
+}
+// The pages that have a notebook and therefore should have the open in colab badge.
+const hasNotebook = [
+    "benchmarks",
+    "custom_datasets",
+    "multilingual",
+    "perplexity",
+    "preprocessing",
+    "quicktour",
+    "task_summary",
+    "tokenizer_summary",
+    "training"
+];
+
 function addIcon() {
    const huggingFaceLogo = "https://huggingface.co/landing/assets/transformers-docs/huggingface_logo.svg";
    const image = document.createElement("img");
@@ -58,11 +95,94 @@ function addGithubButton() {
    document.querySelector(".wy-side-nav-search .icon-home").insertAdjacentHTML('afterend', div);
 }

+// Prepends an "Open In Colab" dropdown to the breadcrumbs aside on pages listed in hasNotebook.
+function addColabLink() {
+    const urlPieces = location.toString().split('/');
+    const pageName = urlPieces[urlPieces.length - 1].split(".")[0];
+    // Pages without a notebook are left untouched
+    if (!hasNotebook.includes(pageName)) return;
+    const baseURL = "https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/"
+    const linksColab = `
+        <div class="colab-dropdown">
+            <img alt="Open In Colab" src="https://colab.research.google.com/assets/colab-badge.svg">
+            <div class="colab-dropdown-content">
+                <button onclick=" window.open('${baseURL}${pageName}.ipynb')">Mixed</button>
+                <button onclick=" window.open('${baseURL}pytorch/${pageName}.ipynb')">PyTorch</button>
+                <button onclick=" window.open('${baseURL}tensorflow/${pageName}.ipynb')">TensorFlow</button>
+            </div>
+        </div>`
+    const aside = document.querySelector(".wy-breadcrumbs-aside")
+    aside.innerHTML = [linksColab, aside.innerHTML].join('\n')
+}
+
+// Adds the version selector (a button and a dropdown of links to this page in every version
+// listed in versionMapping) to the sidebar. The version in view is parsed from the url.
+function addVersionControl() {
+    const parts = location.toString().split('/');
+    let versionIndex = parts.length - 2;
+    // Index page may not have a last part with filename.html so we need to go up. ".html" is
+    // matched anywhere so that "page.html#anchor" or "search.html?q=..." still count as a file.
+    if (parts[parts.length - 1] != "" && !/\.html/.test(parts[parts.length - 1])) {
+        versionIndex = parts.length - 1;
+    }
+    // Main classes and models are nested so we need to go deeper
+    else if (parts[versionIndex] == "main_classes" || parts[versionIndex] == "model_doc") {
+        versionIndex = versionIndex - 1;
+    }
+    // When no version folder is in the url this is expected to be "transformers" (handled below)
+    const version = parts[versionIndex];
+
+    // Menu with all the links
+    const versionMenu = document.createElement("div");
+    versionMenu.classList.add("version-dropdown");
+    // Without a version folder in the url, the "transformers" segment stays in the base url
+    const baseUrlIndex = (version == "transformers") ? versionIndex + 1 : versionIndex;
+    const baseParts = parts.slice(0, baseUrlIndex);
+    const pageParts = parts.slice(versionIndex + 1);
+    const htmlLines = Object.entries(versionMapping).map(([key, value]) => {
+        // The "" key adds no version folder to the url
+        const urlParts = baseParts.concat((key != "") ? [key] : [], pageParts);
+        return `<a href="${urlParts.join('/')}">${value}</a>`;
+    });
+    versionMenu.innerHTML = htmlLines.join('\n');
+
+    // Button for version selection
+    const versionButton = document.createElement("div");
+    versionButton.classList.add("version-button");
+    const label = (version == "transformers") ? stableVersion : version;
+    versionButton.innerText = label.concat(" ▼");
+
+    // Toggle the menu when we click on the button
+    versionButton.addEventListener("click", () => {
+        versionMenu.classList.toggle("version-show");
+    });
+
+    // Hide the menu when we click elsewhere
+    window.addEventListener("click", (event) => {
+        if (event.target != versionButton) {
+            versionMenu.classList.remove('version-show');
+        }
+    });
+
+    // Container
+    const div = document.createElement("div");
+    div.appendChild(versionButton);
+    div.appendChild(versionMenu);
+    div.style.paddingTop = '25px';
+    div.style.backgroundColor = '#6670FF';
+    div.style.display = 'block';
+    div.style.textAlign = 'center';
+
+    const scrollDiv = document.querySelector(".wy-side-scroll");
+    scrollDiv.insertBefore(div, scrollDiv.children[1]);
+}
+
 function addHfMenu() {
    const div = `
    <div class="menu">
        <a href="/welcome">🔥 Sign in</a>
        <a href="/models">🚀 Models</a>
+        <a href="http://discuss.huggingface.co">💬 Forum</a>
    </div>
    `;
    document.body.insertAdjacentHTML('afterbegin', div);
@@ -72,6 +192,8 @@ function platformToggle() {
    const codeBlocks = Array.from(document.getElementsByClassName("highlight"));
    const pytorchIdentifier = "## PYTORCH CODE";
    const tensorflowIdentifier = "## TENSORFLOW CODE";
+
+    const promptSpanIdentifier = `<span class="gp">&gt;&gt;&gt; </span>`
    const pytorchSpanIdentifier = `<span class="c1">${pytorchIdentifier}</span>`;
    const tensorflowSpanIdentifier = `<span class="c1">${tensorflowIdentifier}</span>`;

@@ -84,10 +206,22 @@ function platformToggle() {
        let tensorflowSpans;

        if(pytorchSpanPosition < tensorflowSpanPosition){
-            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, tensorflowSpanPosition);
+            const isPrompt = spans.slice(
+                spans.indexOf(tensorflowSpanIdentifier) - promptSpanIdentifier.length,
+                spans.indexOf(tensorflowSpanIdentifier)
+            ) == promptSpanIdentifier;
+            const finalTensorflowSpanPosition = isPrompt ? tensorflowSpanPosition - promptSpanIdentifier.length : tensorflowSpanPosition;
+
+            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, finalTensorflowSpanPosition);
            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, spans.length);
        }else{
-            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, pytorchSpanPosition);
+            const isPrompt = spans.slice(
+                spans.indexOf(pytorchSpanIdentifier) - promptSpanIdentifier.length,
+                spans.indexOf(pytorchSpanIdentifier)
+            ) == promptSpanIdentifier;
+            const finalPytorchSpanPosition = isPrompt ? pytorchSpanPosition - promptSpanIdentifier.length : pytorchSpanPosition;
+
+            tensorflowSpans = spans.slice(tensorflowSpanPosition + tensorflowSpanIdentifier.length + 1, finalPytorchSpanPosition);
            pytorchSpans = spans.slice(pytorchSpanPosition + pytorchSpanIdentifier.length + 1, spans.length);
        }

@@ -149,10 +283,12 @@ function parseGithubButtons (){"use strict";var e=window.document,t=e.location,o

 function onLoad() {
    addIcon();
+    addVersionControl();
    addCustomFooter();
    addGithubButton();
    parseGithubButtons();
    addHfMenu();
+    addColabLink();
    platformToggle();
 }

--- a/docs/source/benchmarks.md
+++ b/docs/source/benchmarks.md
@@ -1,54 +0,0 @@
-# Benchmarks
-
-This section is dedicated to the Benchmarks done by the library, both by maintainers, contributors and users. These 
-benchmark will help keep track of the preformance improvements that are brought to our models across versions.
-
-## Benchmarking all models for inference
-
-As of version 2.1 we have benchmarked all models for inference, across many different settings: using PyTorch, with
-and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for
-TensorFlow XLA) and GPUs.
-
-The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2)
-
-The results are available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing).
-
-## TF2 with mixed precision, XLA, Distribution (@tlkh)
-
-This work was done by [Timothy Liu](https://github.com/tlkh).
-
-There are very positive results to be gained from the various TensorFlow 2.0 features:
-
- Automatic Mixed Precision (AMP)
- XLA compiler
- Distribution strategies (multi-GPU)
-
-The benefits are listed here (tested on CoLA, MRPC, SST-2):
-
- AMP: Between 1.4x to 1.6x decrease in overall time without change in batch size
- AMP+XLA: Up to 2.5x decrease in overall time on SST-2 (larger dataset)
- Distribution: Between 1.4x to 3.4x decrease in overall time on 4xV100
- Combined: Up to 5.7x decrease in overall training time, or 9.1x training throughput
-
-The model quality (measured by the validation accuracy) fluctuates slightly. Taking an average of 4 training runs 
-on a single GPU gives the following results:
-
- CoLA: AMP results in slighter lower acc (0.820 vs 0.824)
- MRPC: AMP results in lower acc (0.823 vs 0.835)
- SST-2: AMP results in slighter lower acc (0.918 vs 0.922)
-
-However, in a distributed setting with 4xV100 (4x batch size), AMP can yield in better results:
-
-CoLA: AMP results in higher acc (0.828 vs 0.812)
-MRPC: AMP results in lower acc (0.817 vs 0.827)
-SST-2: AMP results in slightly lower acc (0.926 vs 0.929)
-
-The benchmark script is available [here](https://github.com/NVAITC/benchmarking/blob/master/tf2/bert_dist.py).
-
-Note: on some tasks (e.g. MRPC), the dataset is too small. The overhead due to the model compilation with XLA as well
-as the distribution strategy setup does not speed things up. The XLA compile time is also the reason why although throughput 
-can increase a lot (e.g. 2.7x for single GPU), overall (end-to-end) training speed-up is not as fast (as low as 1.4x)
-
-The benefits as seen on SST-2 (larger dataset) is much clear.
-
-All results can be seen on this [Google Sheet](https://docs.google.com/spreadsheets/d/1538MN224EzjbRL239sqSiUy6YY-rAjHyXhTzz_Zptls/edit#gid=960868445).
--- a/docs/source/benchmarks.rst
+++ b/docs/source/benchmarks.rst
@@ -0,0 +1,322 @@
+Benchmarks
+==========
+
+Let's take a look at how 🤗 Transformer models can be benchmarked, best practices, and already available benchmarks.
+
+A notebook explaining in more detail how to benchmark 🤗 Transformer models can be found `here <https://github.com/huggingface/transformers/blob/master/notebooks/05-benchmark.ipynb>`__.
+
+How to benchmark 🤗 Transformer models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` allow to flexibly benchmark 🤗 Transformer models.
+The benchmark classes allow us to measure the `peak memory usage` and `required time` for both 
+`inference` and `training`. 
+
+.. note::
+
+  Hereby, `inference` is defined by a single forward pass, and `training` is defined by a single forward pass and backward pass.
+
+The benchmark classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` expect an object of type :class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments`, respectively, for instantiation. :class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments` are data classes and contain all relevant configurations for their corresponding benchmark class.
+In the following example, it is shown how a BERT model of type `bert-base-uncased` can be benchmarked.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments
+
+    >>> args = PyTorchBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+    >>> benchmark = PyTorchBenchmark(args)
+
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments
+
+    >>> args = TensorFlowBenchmarkArguments(models=["bert-base-uncased"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+    >>> benchmark = TensorFlowBenchmark(args)
+
+
+Here, three arguments are given to the benchmark argument data classes, namely ``models``, ``batch_sizes``, and ``sequence_lengths``. The argument ``models`` is required and expects a :obj:`list` of model identifiers from the `model hub <https://huggingface.co/models>`__.
+The :obj:`list` arguments ``batch_sizes`` and ``sequence_lengths`` define the size of the ``input_ids`` on which the model is benchmarked. 
+There are many more parameters that can be configured via the benchmark argument data classes. For more detail on these one can either directly consult the files 
+``src/transformers/benchmark/benchmark_args_utils.py``, ``src/transformers/benchmark/benchmark_args.py`` (for PyTorch) and ``src/transformers/benchmark/benchmark_args_tf.py`` (for Tensorflow). 
+Alternatively, running the following shell commands from root will print out a descriptive list of all configurable parameters for PyTorch and Tensorflow respectively.
+
+.. code-block:: bash
+
+    ## PYTORCH CODE
+    python examples/benchmarking/run_benchmark.py --help
+
+    ## TENSORFLOW CODE
+    python examples/benchmarking/run_benchmark_tf.py --help
+
+
+An instantiated benchmark object can then simply be run by calling ``benchmark.run()``.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> results = benchmark.run()
+    >>> print(results)
+    ====================       INFERENCE - SPEED - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length     Time in s                  
+    --------------------------------------------------------------------------------
+    bert-base-uncased          8               8             0.006     
+    bert-base-uncased          8               32            0.006     
+    bert-base-uncased          8              128            0.018     
+    bert-base-uncased          8              512            0.088     
+    --------------------------------------------------------------------------------
+    
+    ====================      INFERENCE - MEMORY - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length    Memory in MB 
+    --------------------------------------------------------------------------------
+    bert-base-uncased          8               8             1227
+    bert-base-uncased          8               32            1281
+    bert-base-uncased          8              128            1307
+    bert-base-uncased          8              512            1539
+    --------------------------------------------------------------------------------
+    
+    ====================        ENVIRONMENT INFORMATION         ====================
+    - transformers_version: 2.11.0
+    - framework: PyTorch
+    - use_torchscript: False
+    - framework_version: 1.4.0
+    - python_version: 3.6.10
+    - system: Linux
+    - cpu: x86_64
+    - architecture: 64bit
+    - date: 2020-06-29
+    - time: 08:58:43.371351
+    - fp16: False
+    - use_multiprocessing: True
+    - only_pretrain_model: False
+    - cpu_ram_mb: 32088
+    - use_gpu: True
+    - num_gpus: 1
+    - gpu: TITAN RTX
+    - gpu_ram_mb: 24217
+    - gpu_power_watts: 280.0
+    - gpu_performance_state: 2
+    - use_tpu: False
+    
+    >>> ## TENSORFLOW CODE
+    >>> results = benchmark.run()
+    >>> print(results)
+    ====================       INFERENCE - SPEED - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length     Time in s                  
+    --------------------------------------------------------------------------------
+    bert-base-uncased          8               8             0.005
+    bert-base-uncased          8               32            0.008
+    bert-base-uncased          8              128            0.022
+    bert-base-uncased          8              512            0.105
+    --------------------------------------------------------------------------------
+    
+    ====================      INFERENCE - MEMORY - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length    Memory in MB 
+    --------------------------------------------------------------------------------
+    bert-base-uncased          8               8             1330
+    bert-base-uncased          8               32            1330
+    bert-base-uncased          8              128            1330
+    bert-base-uncased          8              512            1770
+    --------------------------------------------------------------------------------
+    
+    ====================        ENVIRONMENT INFORMATION         ====================
+    - transformers_version: 2.11.0
+    - framework: Tensorflow
+    - use_xla: False
+    - framework_version: 2.2.0
+    - python_version: 3.6.10
+    - system: Linux
+    - cpu: x86_64
+    - architecture: 64bit
+    - date: 2020-06-29
+    - time: 09:26:35.617317
+    - fp16: False
+    - use_multiprocessing: True
+    - only_pretrain_model: False
+    - cpu_ram_mb: 32088
+    - use_gpu: True
+    - num_gpus: 1
+    - gpu: TITAN RTX
+    - gpu_ram_mb: 24217
+    - gpu_power_watts: 280.0
+    - gpu_performance_state: 2
+    - use_tpu: False
+
+By default, the `time` and the `required memory` for `inference` are benchmarked. 
+In the example output above the first two sections show the result corresponding to `inference time` and `inference memory`. 
+In addition, all relevant information about the computing environment, `e.g.` the GPU type, the system, the library versions, etc... are printed out in the third section under `ENVIRONMENT INFORMATION`.
+This information can optionally be saved in a `.csv` file when adding the argument :obj:`save_to_csv=True` to :class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments` respectively.
+In this case, every section is saved in a separate `.csv` file. The path to each `.csv` file can optionally be defined via the argument data classes.
+
+Instead of benchmarking pre-trained models via their model identifier, `e.g.` `bert-base-uncased`, the user can alternatively benchmark an arbitrary configuration of any available model class. 
+In this case, a :obj:`list` of configurations must be inserted with the benchmark args as follows.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import PyTorchBenchmark, PyTorchBenchmarkArguments, BertConfig
+
+    >>> args = PyTorchBenchmarkArguments(models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+    >>> config_base = BertConfig()
+    >>> config_384_hid = BertConfig(hidden_size=384)
+    >>> config_6_lay = BertConfig(num_hidden_layers=6)
+
+    >>> benchmark = PyTorchBenchmark(args, configs=[config_base, config_384_hid, config_6_lay])
+    >>> benchmark.run()
+    ====================       INFERENCE - SPEED - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length       Time in s                  
+    --------------------------------------------------------------------------------
+    bert-base                  8               8             0.006
+    bert-base                  8               32            0.006
+    bert-base                  8              128            0.018     
+    bert-base                  8              512            0.088     
+    bert-384-hid              8               8             0.006     
+    bert-384-hid              8               32            0.006     
+    bert-384-hid              8              128            0.011     
+    bert-384-hid              8              512            0.054     
+    bert-6-lay                 8               8             0.003     
+    bert-6-lay                 8               32            0.004     
+    bert-6-lay                 8              128            0.009     
+    bert-6-lay                 8              512            0.044
+    --------------------------------------------------------------------------------
+    
+    ====================      INFERENCE - MEMORY - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length      Memory in MB 
+    --------------------------------------------------------------------------------
+    bert-base                  8               8             1277
+    bert-base                  8               32            1281
+    bert-base                  8              128            1307     
+    bert-base                  8              512            1539     
+    bert-384-hid              8               8             1005     
+    bert-384-hid              8               32            1027     
+    bert-384-hid              8              128            1035     
+    bert-384-hid              8              512            1255     
+    bert-6-lay                 8               8             1097     
+    bert-6-lay                 8               32            1101     
+    bert-6-lay                 8              128            1127     
+    bert-6-lay                 8              512            1359
+    --------------------------------------------------------------------------------
+    
+    ====================        ENVIRONMENT INFORMATION         ====================
+    - transformers_version: 2.11.0
+    - framework: PyTorch
+    - use_torchscript: False
+    - framework_version: 1.4.0
+    - python_version: 3.6.10
+    - system: Linux
+    - cpu: x86_64
+    - architecture: 64bit
+    - date: 2020-06-29
+    - time: 09:35:25.143267
+    - fp16: False
+    - use_multiprocessing: True
+    - only_pretrain_model: False
+    - cpu_ram_mb: 32088
+    - use_gpu: True
+    - num_gpus: 1
+    - gpu: TITAN RTX
+    - gpu_ram_mb: 24217
+    - gpu_power_watts: 280.0
+    - gpu_performance_state: 2
+    - use_tpu: False
+
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TensorFlowBenchmark, TensorFlowBenchmarkArguments, BertConfig
+
+    >>> args = TensorFlowBenchmarkArguments(models=["bert-base", "bert-384-hid", "bert-6-lay"], batch_sizes=[8], sequence_lengths=[8, 32, 128, 512])
+    >>> config_base = BertConfig()
+    >>> config_384_hid = BertConfig(hidden_size=384)
+    >>> config_6_lay = BertConfig(num_hidden_layers=6)
+
+    >>> benchmark = TensorFlowBenchmark(args, configs=[config_base, config_384_hid, config_6_lay])
+    >>> benchmark.run()
+    ====================       INFERENCE - SPEED - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length       Time in s                  
+    --------------------------------------------------------------------------------
+    bert-base                  8               8             0.005
+    bert-base                  8               32            0.008
+    bert-base                  8              128            0.022
+    bert-base                  8              512            0.106
+    bert-384-hid              8               8             0.005
+    bert-384-hid              8               32            0.007
+    bert-384-hid              8              128            0.018
+    bert-384-hid              8              512            0.064
+    bert-6-lay                 8               8             0.002
+    bert-6-lay                 8               32            0.003
+    bert-6-lay                 8              128            0.0011
+    bert-6-lay                 8              512            0.074
+    --------------------------------------------------------------------------------
+    
+    ====================      INFERENCE - MEMORY - RESULT       ====================
+    --------------------------------------------------------------------------------
+    Model Name             Batch Size     Seq Length      Memory in MB 
+    --------------------------------------------------------------------------------
+    bert-base                  8               8             1330
+    bert-base                  8               32            1330
+    bert-base                  8              128            1330
+    bert-base                  8              512            1770
+    bert-384-hid              8               8             1330
+    bert-384-hid              8               32            1330
+    bert-384-hid              8              128            1330
+    bert-384-hid              8              512            1540
+    bert-6-lay                 8               8             1330
+    bert-6-lay                 8               32            1330
+    bert-6-lay                 8              128            1330
+    bert-6-lay                 8              512            1540
+    --------------------------------------------------------------------------------
+    
+    ====================        ENVIRONMENT INFORMATION         ====================
+    - transformers_version: 2.11.0
+    - framework: Tensorflow
+    - use_xla: False
+    - framework_version: 2.2.0
+    - python_version: 3.6.10
+    - system: Linux
+    - cpu: x86_64
+    - architecture: 64bit
+    - date: 2020-06-29
+    - time: 09:38:15.487125
+    - fp16: False
+    - use_multiprocessing: True
+    - only_pretrain_model: False
+    - cpu_ram_mb: 32088
+    - use_gpu: True
+    - num_gpus: 1
+    - gpu: TITAN RTX
+    - gpu_ram_mb: 24217
+    - gpu_power_watts: 280.0
+    - gpu_performance_state: 2
+    - use_tpu: False
+
+
+Again, `inference time` and `required memory` for `inference` are measured, but this time for customized configurations of the :obj:`BertModel` class. This feature can especially be helpful when 
+deciding for which configuration the model should be trained.
+
+
+Benchmark best practices
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+This section lists a couple of best practices one should be aware of when benchmarking a model.
+
+- Currently, only single device benchmarking is supported. When benchmarking on GPU, it is recommended that the user 
+  specifies on which device the code should be run by setting the ``CUDA_VISIBLE_DEVICES`` environment variable in the shell, `e.g.` ``export CUDA_VISIBLE_DEVICES=0`` before running the code.
+- The option :obj:`no_multi_processing` should only be set to :obj:`True` for testing and debugging. To ensure accurate memory measurement it is recommended to run each memory benchmark in a separate process by making sure :obj:`no_multi_processing` is set to :obj:`False`.
+- One should always state the environment information when sharing the results of a model benchmark. Results can vary heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very useful for the community.
+
+
+Sharing your benchmark
+~~~~~~~~~~~~~~~~~~~~~~
+
+Previously all available core models (10 at the time) have been benchmarked for `inference time`, across many different settings: using PyTorch, with
+and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for
+TensorFlow XLA) and GPUs.
+
+The approach is detailed in the `following blogpost <https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2>`__ and the results are available `here <https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing>`__.
+
+With the new `benchmark` tools, it is easier than ever to share your benchmark results with the community `here <https://github.com/huggingface/transformers/blob/master/examples/benchmarking/README.md>`__.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,7 @@ author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'2.11.0'
+release = u'3.2.0'


 # -- General configuration ---------------------------------------------------
@@ -44,7 +44,8 @@ extensions = [
    'sphinx.ext.napoleon',
    'recommonmark',
    'sphinx.ext.viewcode',
-    'sphinx_markdown_tables'
+    'sphinx_markdown_tables',
+    'sphinx_copybutton'
 ]

 # Add any paths that contain templates here, relative to this directory.
@@ -74,6 +75,9 @@ exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
 # The name of the Pygments (syntax highlighting) style to use.
 pygments_style = None

+# Remove the prompt when copying examples
+copybutton_prompt_text = r">>> |\.\.\. "
+copybutton_prompt_is_regexp = True

 # -- Options for HTML output -------------------------------------------------

@@ -187,8 +191,8 @@ epub_title = project
 epub_exclude_files = ['search.html']

 def setup(app):
-    app.add_stylesheet('css/huggingface.css')
-    app.add_stylesheet('css/code-snippets.css')
+    app.add_css_file('css/huggingface.css')
+    app.add_css_file('css/code-snippets.css')
    app.add_js_file('js/custom.js')

 # -- Extension configuration -------------------------------------------------
--- a/docs/source/contributing.md
+++ b/docs/source/contributing.md
@@ -0,0 +1 @@
+../../CONTRIBUTING.md
--- a/docs/source/custom_datasets.rst
+++ b/docs/source/custom_datasets.rst
@@ -0,0 +1,715 @@
+Fine-tuning with custom datasets
+================================
+
+.. note::
+
+    The datasets used in this tutorial are available and can be more easily accessed using the
+    `🤗 NLP library <https://github.com/huggingface/nlp>`_. We do not use this library to access the datasets here
+    since this tutorial is meant to illustrate how to work with your own data. A brief introduction can be found
+    at the end of the tutorial in the section ":ref:`nlplib`".
+
+This tutorial will take you through several examples of using 🤗 Transformers models with your own datasets. The
+guide shows one of many valid workflows for using these models and is meant to be illustrative rather than
+definitive. We show examples of reading in several data formats, preprocessing the data for several types of tasks,
+and then preparing the data into PyTorch/TensorFlow ``Dataset`` objects which can easily be used either with
+:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow.
+
+We include several examples, each of which demonstrates a different type of common downstream task:
+
+  - :ref:`seq_imdb`
+  - :ref:`tok_ner`
+  - :ref:`qa_squad`
+  - :ref:`resources`
+
+.. _seq_imdb:
+
+Sequence Classification with IMDb Reviews
+-----------------------------------------
+
+.. note::
+
+    This dataset can be explored in the Hugging Face model hub (`IMDb <https://huggingface.co/datasets/imdb>`_), and can
+    be alternatively downloaded with the 🤗 NLP library with ``load_dataset("imdb")``.
+
+In this example, we'll show how to download, tokenize, and train a model on the IMDb reviews dataset. This task
+takes the text of a review and requires the model to predict whether the sentiment of the review is positive or
+negative. Let's start by downloading the dataset from the
+`Large Movie Review Dataset <http://ai.stanford.edu/~amaas/data/sentiment/>`_ webpage.
+
+.. code-block:: bash
+
+    wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
+    tar -xf aclImdb_v1.tar.gz
+
+This data is organized into ``pos`` and ``neg`` folders with one text file per example. Let's write a function that can
+read this in.
+
+.. code-block:: python
+
+    from pathlib import Path
+
+    def read_imdb_split(split_dir):
+        split_dir = Path(split_dir)
+        texts = []
+        labels = []
+        for label_dir in ["pos", "neg"]:
+            for text_file in (split_dir/label_dir).iterdir():
+                texts.append(text_file.read_text())
+                labels.append(0 if label_dir == "neg" else 1)
+
+        return texts, labels
+
+    train_texts, train_labels = read_imdb_split('aclImdb/train')
+    test_texts, test_labels = read_imdb_split('aclImdb/test')
+
+We now have a train and test dataset, but let's also create a validation set which we can use for evaluation and
+tuning without tainting our test set results. Sklearn has a convenient utility for creating such
+splits:
+
+.. code-block:: python
+
+    from sklearn.model_selection import train_test_split
+    train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)
+
+Alright, we've read in our dataset. Now let's tackle tokenization. We'll eventually train a classifier using
+pre-trained DistilBert, so let's use the DistilBert tokenizer.
+
+.. code-block:: python
+
+    from transformers import DistilBertTokenizerFast
+    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
+
+Now we can simply pass our texts to the tokenizer. We'll pass ``truncation=True`` and ``padding=True``, which will
+ensure that all of our sequences are padded to the same length and are truncated to be no longer than the model's
+maximum input length. This will allow us to feed batches of sequences into the model at the same time.
+
+.. code-block:: python
+
+    train_encodings = tokenizer(train_texts, truncation=True, padding=True)
+    val_encodings = tokenizer(val_texts, truncation=True, padding=True)
+    test_encodings = tokenizer(test_texts, truncation=True, padding=True)
+
+Now, let's turn our labels and encodings into a Dataset object. In PyTorch, this is done by subclassing a
+``torch.utils.data.Dataset`` object and implementing ``__len__`` and ``__getitem__``. In TensorFlow, we pass our input encodings and
+labels to the ``from_tensor_slices`` constructor method. We put the data in this format so that the data can be
+easily batched such that each key in the batch encoding corresponds to a named parameter of the
+:meth:`~transformers.DistilBertForSequenceClassification.forward` method of the model we will train.
+
+.. code-block:: python
+
+    ## PYTORCH CODE
+    import torch
+
+    class IMDbDataset(torch.utils.data.Dataset):
+        def __init__(self, encodings, labels):
+            self.encodings = encodings
+            self.labels = labels
+
+        def __getitem__(self, idx):
+            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+            item['labels'] = torch.tensor(self.labels[idx])
+            return item
+
+        def __len__(self):
+            return len(self.labels)
+
+    train_dataset = IMDbDataset(train_encodings, train_labels)
+    val_dataset = IMDbDataset(val_encodings, val_labels)
+    test_dataset = IMDbDataset(test_encodings, test_labels)
+    ## TENSORFLOW CODE
+    import tensorflow as tf
+
+    train_dataset = tf.data.Dataset.from_tensor_slices((
+        dict(train_encodings),
+        train_labels
+    ))
+    val_dataset = tf.data.Dataset.from_tensor_slices((
+        dict(val_encodings),
+        val_labels
+    ))
+    test_dataset = tf.data.Dataset.from_tensor_slices((
+        dict(test_encodings),
+        test_labels
+    ))
+
+Now that our datasets are ready, we can fine-tune a model either with the 🤗
+:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow. See
+:doc:`training <training>`.
+
+.. _ft_trainer:
+
+Fine-tuning with Trainer
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+The steps above prepared the datasets in the way that the trainer expects. Now all we need to do is create a
+model to fine-tune, define the :class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments`
+and instantiate a :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`.
+
+.. code-block:: python
+
+    ## PYTORCH CODE
+    from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
+
+    training_args = TrainingArguments(
+        output_dir='./results',          # output directory
+        num_train_epochs=3,              # total number of training epochs
+        per_device_train_batch_size=16,  # batch size per device during training
+        per_device_eval_batch_size=64,   # batch size for evaluation
+        warmup_steps=500,                # number of warmup steps for learning rate scheduler
+        weight_decay=0.01,               # strength of weight decay
+        logging_dir='./logs',            # directory for storing logs
+        logging_steps=10,
+    )
+
+    model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+
+    trainer = Trainer(
+        model=model,                         # the instantiated 🤗 Transformers model to be trained
+        args=training_args,                  # training arguments, defined above
+        train_dataset=train_dataset,         # training dataset
+        eval_dataset=val_dataset             # evaluation dataset
+    )
+
+    trainer.train()
+    ## TENSORFLOW CODE
+    from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
+
+    training_args = TFTrainingArguments(
+        output_dir='./results',          # output directory
+        num_train_epochs=3,              # total number of training epochs
+        per_device_train_batch_size=16,  # batch size per device during training
+        per_device_eval_batch_size=64,   # batch size for evaluation
+        warmup_steps=500,                # number of warmup steps for learning rate scheduler
+        weight_decay=0.01,               # strength of weight decay
+        logging_dir='./logs',            # directory for storing logs
+        logging_steps=10,
+    )
+
+    with training_args.strategy.scope():
+        model = TFDistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")
+
+    trainer = TFTrainer(
+        model=model,                         # the instantiated 🤗 Transformers model to be trained
+        args=training_args,                  # training arguments, defined above
+        train_dataset=train_dataset,         # training dataset
+        eval_dataset=val_dataset             # evaluation dataset
+    )
+
+    trainer.train()
+
+.. _ft_native:
+
+Fine-tuning with native PyTorch/TensorFlow
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We can also train using native PyTorch or TensorFlow:
+
+.. code-block:: python
+
+    ## PYTORCH CODE
+    from torch.utils.data import DataLoader
+    from transformers import DistilBertForSequenceClassification, AdamW
+
+    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+    model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
+    model.to(device)
+    model.train()
+
+    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
+
+    optim = AdamW(model.parameters(), lr=5e-5)
+
+    for epoch in range(3):
+        for batch in train_loader:
+            optim.zero_grad()
+            input_ids = batch['input_ids'].to(device)
+            attention_mask = batch['attention_mask'].to(device)
+            labels = batch['labels'].to(device)
+            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
+            loss = outputs[0]
+            loss.backward()
+            optim.step()
+
+    model.eval()
+    ## TENSORFLOW CODE
+    from transformers import TFDistilBertForSequenceClassification
+
+    model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased')
+
+    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
+    model.compile(optimizer=optimizer, loss=model.compute_loss) # can also use any keras loss fn
+    model.fit(train_dataset.shuffle(1000).batch(16), epochs=3, batch_size=16)
+
+.. _tok_ner:
+
+Token Classification with W-NUT Emerging Entities
+-------------------------------------------------
+
+.. note::
+
+    This dataset can be explored in the Hugging Face model hub (`WNUT-17 <https://huggingface.co/datasets/wnut_17>`_), and can
+    be alternatively downloaded with the 🤗 NLP library with ``load_dataset("wnut_17")``.
+
+Next we will look at token classification. Rather than classifying an entire sequence, this task classifies token by
+token. We'll demonstrate how to do this with 
+`Named Entity Recognition <http://nlpprogress.com/english/named_entity_recognition.html>`_, which involves
+identifying tokens which correspond to a predefined set of "entities". Specifically, we'll use the
+`W-NUT Emerging and Rare entities <http://noisy-text.github.io/2017/emerging-rare-entities.html>`_ corpus. The data
+is given as a collection of pre-tokenized documents where each token is assigned a tag.
+
+Let's start by downloading the data.
+
+.. code-block:: bash
+
+    wget http://noisy-text.github.io/2017/files/wnut17train.conll
+
+In this case, we'll just download the train set, which is a single text file. Each line of the file contains either
+(1) a word and tag separated by a tab, or (2) a blank line indicating the end of a document. Let's write a
+function to read this in. We'll take in the file path and return ``token_docs`` which is a list of lists of token
+strings, and ``tag_docs`` which is a list of lists of tag strings.
+
+.. code-block:: python
+
+    from pathlib import Path
+    import re
+
+    def read_wnut(file_path):
+        file_path = Path(file_path)
+
+        raw_text = file_path.read_text().strip()
+        raw_docs = re.split(r'\n\t?\n', raw_text)
+        token_docs = []
+        tag_docs = []
+        for doc in raw_docs:
+            tokens = []
+            tags = []
+            for line in doc.split('\n'):
+                token, tag = line.split('\t')
+                tokens.append(token)
+                tags.append(tag)
+            token_docs.append(tokens)
+            tag_docs.append(tags)
+        
+        return token_docs, tag_docs
+    
+    texts, tags = read_wnut('wnut17train.conll')
+    
+Just to see what this data looks like, let's take a look at a segment of the first document.
+
+.. code-block:: python
+
+    >>> print(texts[0][10:17], tags[0][10:17], sep='\n')
+    ['for', 'two', 'weeks', '.', 'Empire', 'State', 'Building']
+    ['O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location']
+
+``location`` is an entity type, ``B-`` indicates the beginning of an entity, and ``I-`` indicates consecutive positions of
+the same entity ("Empire State Building" is considered one entity). ``O`` indicates the token does not correspond to
+any entity.
+
+Now that we've read the data in, let's create a train/validation split:
+
+.. code-block:: python
+
+    from sklearn.model_selection import train_test_split
+    train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)
+
+Next, let's create encodings for our tokens and tags. For the tags, we can start by just creating a simple mapping
+which we'll use in a moment:
+
+.. code-block:: python
+
+    unique_tags = set(tag for doc in tags for tag in doc)
+    tag2id = {tag: id for id, tag in enumerate(unique_tags)}
+    id2tag = {id: tag for tag, id in tag2id.items()}
+
+To encode the tokens, we'll use a pre-trained DistilBert tokenizer. We can tell the tokenizer that we're dealing
+with ready-split tokens rather than full sentence strings by passing ``is_split_into_words=True``. We'll also pass
+``padding=True`` and ``truncation=True`` to pad the sequences to be the same length. Lastly, we can tell the
+tokenizer to return information about the tokens which are split by the wordpiece tokenization process, which we
+will need in a moment.
+
+.. code-block:: python
+
+    from transformers import DistilBertTokenizerFast
+    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
+    train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
+    val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
+
+Great, so now our tokens are nicely encoded in the format that they need to be in to feed them into our DistilBert
+model below.
+
+Now we arrive at a common obstacle with using pre-trained models for token-level classification: many of the tokens
+in the W-NUT corpus are not in DistilBert's vocabulary. Bert and many models like it use a method called WordPiece
+Tokenization, meaning that single words are split into multiple tokens such that each token is likely to be in
+the vocabulary. For example, DistilBert's tokenizer would split the Twitter handle ``@huggingface`` into the tokens
+``['@', 'hugging', '##face']``. This is a problem for us because we have exactly one tag per token. If the tokenizer
+splits a token into multiple sub-tokens, then we will end up with a mismatch between our tokens and our labels.
+
+One way to handle this is to only train on the tag labels for the first subtoken of a split token. We can do this in
+🤗 Transformers by setting the labels we wish to ignore to ``-100``. In the example above, if the label for
+``@huggingface`` is ``3`` (indexing ``B-corporation``), we would set the labels of ``['@', 'hugging', '##face']`` to
+``[3, -100, -100]``.
+
+Let's write a function to do this. This is where we will use the ``offset_mapping`` from the tokenizer as mentioned
+above. For each sub-token returned by the tokenizer, the offset mapping gives us a tuple indicating the sub-token's
+start position and end position relative to the original token it was split from. That means that if the first
+position in the tuple is anything other than ``0``, we will set its corresponding label to ``-100``. While we're at
+it, we can also set labels to ``-100`` if the second position of the offset mapping is ``0``, since this means it must
+be a special token like ``[PAD]`` or ``[CLS]``.
+
+.. note:: 
+
+    Due to a recently fixed bug, -1 must be used instead of -100 when using TensorFlow in 🤗 Transformers <= 3.0.2.
+
+.. code-block:: python
+
+    import numpy as np
+
+    def encode_tags(tags, encodings):
+        labels = [[tag2id[tag] for tag in doc] for doc in tags]
+        encoded_labels = []
+        for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
+            # create an empty array of -100
+            doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
+            arr_offset = np.array(doc_offset)
+
+            # set labels whose first offset position is 0 and the second is not 0
+            doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
+            encoded_labels.append(doc_enc_labels.tolist())
+
+        return encoded_labels
+    
+    train_labels = encode_tags(train_tags, train_encodings)
+    val_labels = encode_tags(val_tags, val_encodings)
+
+The hard part is now done. Just as in the sequence classification example above, we can create a dataset object:
+
+.. code-block:: python
+
+    ## PYTORCH CODE
+    import torch
+
+    class WNUTDataset(torch.utils.data.Dataset):
+        def __init__(self, encodings, labels):
+            self.encodings = encodings
+            self.labels = labels
+
+        def __getitem__(self, idx):
+            item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+            item['labels'] = torch.tensor(self.labels[idx])
+            return item
+
+        def __len__(self):
+            return len(self.labels)
+
+    train_encodings.pop("offset_mapping") # we don't want to pass this to the model
+    val_encodings.pop("offset_mapping")
+    train_dataset = WNUTDataset(train_encodings, train_labels)
+    val_dataset = WNUTDataset(val_encodings, val_labels)
+    ## TENSORFLOW CODE
+    import tensorflow as tf
+
+    train_encodings.pop("offset_mapping") # we don't want to pass this to the model
+    val_encodings.pop("offset_mapping")
+
+    train_dataset = tf.data.Dataset.from_tensor_slices((
+        dict(train_encodings),
+        train_labels
+    ))
+    val_dataset = tf.data.Dataset.from_tensor_slices((
+        dict(val_encodings),
+        val_labels
+    ))
+
+Now load in a token classification model and specify the number of labels:
+
+.. code-block:: python
+
+    ## PYTORCH CODE
+    from transformers import DistilBertForTokenClassification
+    model = DistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))
+    ## TENSORFLOW CODE
+    from transformers import TFDistilBertForTokenClassification
+    model = TFDistilBertForTokenClassification.from_pretrained('distilbert-base-cased', num_labels=len(unique_tags))
+
+The data and model are both ready to go. You can train the model either with
+:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow, exactly as in the
+sequence classification example above.
+
+  - :ref:`ft_trainer`
+  - :ref:`ft_native`
+
+.. _qa_squad:
+
+Question Answering with SQuAD 2.0
+---------------------------------
+
+.. note::
+
+    This dataset can be explored in the Hugging Face model hub (`SQuAD V2 <https://huggingface.co/datasets/squad_v2>`_), and can
+    be alternatively downloaded with the 🤗 NLP library with ``load_dataset("squad_v2")``.
+
+Question answering comes in many forms. In this example, we'll look at the particular type of extractive QA that
+involves answering a question about a passage by highlighting the segment of the passage that answers the question.
+This involves fine-tuning a model which predicts a start position and an end position in the passage. We will use the
+`Stanford Question Answering Dataset (SQuAD) 2.0 <https://rajpurkar.github.io/SQuAD-explorer/>`_.
+
+We will start by downloading the data:
+
+.. code-block:: bash
+
+    mkdir squad
+    wget https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json -O squad/train-v2.0.json
+    wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json
+
+Each split is in a structured json file with a number of questions and answers for each passage (or context). We'll
+take this apart into parallel lists of contexts, questions, and answers (note that the contexts here are repeated
+since there are multiple questions per context):
+
+.. code-block:: python
+
+    import json
+    from pathlib import Path
+
+    def read_squad(path):
+        path = Path(path)
+        with open(path, 'rb') as f:
+            squad_dict = json.load(f)
+
+        contexts = []
+        questions = []
+        answers = []
+        for group in squad_dict['data']:
+            for passage in group['paragraphs']:
+                context = passage['context']
+                for qa in passage['qas']:
+                    question = qa['question']
+                    for answer in qa['answers']:
+                        contexts.append(context)
+                        questions.append(question)
+                        answers.append(answer)
+
+        return contexts, questions, answers
+    
+    train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
+    val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')
+
+The contexts and questions are just strings. The answers are dicts containing the subsequence of the passage with
+the correct answer as well as an integer indicating the character at which the answer begins. In order to train a
+model on this data we need (1) the tokenized context/question pairs, and (2) integers indicating at which *token*
+positions the answer begins and ends.
+
+First, let's get the *character* position at which the answer ends in the passage (we are given the starting
+position). Sometimes SQuAD answers are off by one or two characters, so we will also adjust for that.
+
+.. code-block:: python
+
+    def add_end_idx(answers, contexts):
+        for answer, context in zip(answers, contexts):
+            gold_text = answer['text']
+            start_idx = answer['answer_start']
+            end_idx = start_idx + len(gold_text)
+            
+            # sometimes squad answers are off by a character or two – fix this
+            if context[start_idx:end_idx] == gold_text:
+                answer['answer_end'] = end_idx
+            elif context[start_idx-1:end_idx-1] == gold_text:
+                answer['answer_start'] = start_idx - 1
+                answer['answer_end'] = end_idx - 1     # When the gold label is off by one character
+            elif context[start_idx-2:end_idx-2] == gold_text:
+                answer['answer_start'] = start_idx - 2
+                answer['answer_end'] = end_idx - 2     # When the gold label is off by two characters
+
+    add_end_idx(train_answers, train_contexts)
+    add_end_idx(val_answers, val_contexts)
+
+Now ``train_answers`` and ``val_answers`` include the character end positions and the corrected start positions.
+Next, let's tokenize our context/question pairs. 🤗 Tokenizers can accept parallel lists of sequences and encode
+them together as sequence pairs.
+
+.. code-block:: python
+
+    from transformers import DistilBertTokenizerFast
+    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
+
+    train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
+    val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)
+
+Next we need to convert our character start/end positions to token start/end positions. When using 🤗 Fast
+Tokenizers, we can use the built-in :func:`~transformers.BatchEncoding.char_to_token` method.
+
+.. code-block:: python
+
+    def add_token_positions(encodings, answers):
+        start_positions = []
+        end_positions = []
+        for i in range(len(answers)):
+            start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
+            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
+            # if None, the answer passage has been truncated
+            if start_positions[-1] is None:
+                start_positions[-1] = tokenizer.model_max_length
+            if end_positions[-1] is None:
+                end_positions[-1] = tokenizer.model_max_length
+        encodings.update({'start_positions': start_positions, 'end_positions': end_positions})
+
+    add_token_positions(train_encodings, train_answers)
+    add_token_positions(val_encodings, val_answers)
+
+Our data is ready. Let's just put it in a PyTorch/TensorFlow dataset so that we can easily use it for
+training. In PyTorch, we define a custom ``Dataset`` class. In TensorFlow, we pass a tuple of
+``(inputs_dict, labels_dict)`` to the ``from_tensor_slices`` method.
+
+.. code-block:: python
+
+    ## PYTORCH CODE
+    import torch
+
+    class SquadDataset(torch.utils.data.Dataset):
+        def __init__(self, encodings):
+            self.encodings = encodings
+
+        def __getitem__(self, idx):
+            return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
+
+        def __len__(self):
+            return len(self.encodings.input_ids)
+        
+    train_dataset = SquadDataset(train_encodings)
+    val_dataset = SquadDataset(val_encodings)
+    ## TENSORFLOW CODE
+    import tensorflow as tf
+
+    train_dataset = tf.data.Dataset.from_tensor_slices((
+        {key: train_encodings[key] for key in ['input_ids', 'attention_mask']},
+        {key: train_encodings[key] for key in ['start_positions', 'end_positions']}
+    ))
+    val_dataset = tf.data.Dataset.from_tensor_slices((
+        {key: val_encodings[key] for key in ['input_ids', 'attention_mask']},
+        {key: val_encodings[key] for key in ['start_positions', 'end_positions']}
+    ))
+
+Now we can use a DistilBert model with a QA head for training:
+
+.. code-block:: python
+
+    ## PYTORCH CODE
+    from transformers import DistilBertForQuestionAnswering
+    model = DistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+    ## TENSORFLOW CODE
+    from transformers import TFDistilBertForQuestionAnswering
+    model = TFDistilBertForQuestionAnswering.from_pretrained("distilbert-base-uncased")
+
+
+The data and model are both ready to go. You can train the model with
+:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` exactly as in the sequence classification example
+above. If using native PyTorch, replace ``labels`` with ``start_positions`` and ``end_positions`` in the training
+example. If using Keras's ``fit``, we need to make a minor modification to handle this example since it involves
+multiple model outputs.
+
+  - :ref:`ft_trainer`
+
+.. code-block:: python
+
+    ## PYTORCH CODE
+    from torch.utils.data import DataLoader
+    from transformers import AdamW
+
+    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
+
+    model.to(device)
+    model.train()
+
+    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
+
+    optim = AdamW(model.parameters(), lr=5e-5)
+
+    for epoch in range(3):
+        for batch in train_loader:
+            optim.zero_grad()
+            input_ids = batch['input_ids'].to(device)
+            attention_mask = batch['attention_mask'].to(device)
+            start_positions = batch['start_positions'].to(device)
+            end_positions = batch['end_positions'].to(device)
+            outputs = model(input_ids, attention_mask=attention_mask, start_positions=start_positions, end_positions=end_positions)
+            loss = outputs[0]
+            loss.backward()
+            optim.step()
+
+    model.eval()
+    ## TENSORFLOW CODE
+    # Keras will expect a tuple when dealing with labels
+    train_dataset = train_dataset.map(lambda x, y: (x, (y['start_positions'], y['end_positions'])))
+
+    # Keras will assign a separate loss for each output and add them together. So we'll just use the standard CE loss
+    # instead of using the built-in model.compute_loss, which expects a dict of outputs and averages the two terms.
+    # Note that this means the loss will be 2x of when using TFTrainer since we're adding instead of averaging them.
+    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+    model.distilbert.return_dict = False # if using 🤗 Transformers >3.0.2, make sure outputs are tuples
+
+    optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
+    model.compile(optimizer=optimizer, loss=loss) # can also use any keras loss fn
+    model.fit(train_dataset.shuffle(1000).batch(16), epochs=3, batch_size=16)
+
+.. _resources:
+
+Additional Resources
+--------------------
+
+  - `How to train a new language model from scratch using Transformers and Tokenizers
+    <https://huggingface.co/blog/how-to-train>`_. Blog post showing the steps to load in Esperanto data and train a
+    masked language model from scratch.
+  - :doc:`Preprocessing <preprocessing>`. Docs page on data preprocessing.
+  - :doc:`Training <training>`. Docs page on training and fine-tuning.
+
+.. _nlplib:
+
+Using the 🤗 NLP Datasets & Metrics library
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This tutorial demonstrates how to read in datasets from various raw text formats and prepare them for training with
+🤗 Transformers so that you can do the same thing with your own custom datasets. However, we recommend users use the
+`🤗 NLP library <https://github.com/huggingface/nlp>`_ for working with the 150+ datasets included in the
+`hub <https://huggingface.co/datasets>`_, including the three datasets used in this tutorial. As a very brief overview,
+we will show how to use the NLP library to download and prepare the IMDb dataset from the first example,
+:ref:`seq_imdb`.
+
+Start by downloading the dataset:
+
+.. code-block:: python
+
+    from nlp import load_dataset
+    train = load_dataset("imdb", split="train")
+
+Each dataset has multiple columns corresponding to different features. Let's see what our columns are.
+
+.. code-block:: python
+
+    >>> print(train.column_names)
+    ['label', 'text']
+
+Great. Now let's tokenize the text. We can do this using the ``map`` method. We'll also rename the ``label`` column
+to ``labels`` to match the model's input arguments.
+
+.. code-block:: python
+
+    train = train.map(lambda batch: tokenizer(batch["text"], truncation=True, padding=True), batched=True)
+    train.rename_column_("label", "labels")
+
+Lastly, we can use the ``set_format`` method to determine which columns and in what data format we want to access
+dataset elements.
+
+.. code-block:: python
+
+    ## PYTORCH CODE
+    >>> train.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
+    >>> {key: val.shape for key, val in train[0].items()}
+    {'labels': torch.Size([]), 'input_ids': torch.Size([512]), 'attention_mask': torch.Size([512])}
+    ## TENSORFLOW CODE
+    >>> train.set_format("tensorflow", columns=["input_ids", "attention_mask", "labels"])
+    >>> {key: val.shape for key, val in train[0].items()}
+    {'labels': TensorShape([]), 'input_ids': TensorShape([512]), 'attention_mask': TensorShape([512])}
+
+We now have a fully-prepared dataset. Check out `the 🤗 NLP docs <https://huggingface.co/nlp/processing.html>`_ for
+a more thorough introduction.
--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -1,11 +1,41 @@
 Glossary
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+^^^^^^^^
+
+General terms
+-------------
+
+- autoencoding models: see MLM
+- autoregressive models: see CLM
+- CLM: causal language modeling, a pretraining task where the model reads the texts in order and has to predict the
+  next word. It's usually done by reading the whole sentence but using a mask inside the model to hide the future
+  tokens at a certain timestep.
+- MLM: masked language modeling, a pretraining task where the model sees a corrupted version of the texts, usually done
+  by masking some tokens randomly, and has to predict the original text.
+- multimodal: a task that combines texts with another kind of input (for instance images).
+- NLG: natural language generation, all tasks related to generating text (for instance talk with transformers,
+  translation)
+- NLP: natural language processing, a generic way to say "deal with texts".
+- NLU: natural language understanding, all tasks related to understanding what is in a text (for instance classifying
+  the whole text, individual words)
+- pretrained model: a model that has been pretrained on some data (for instance all of Wikipedia). Pretraining methods
+  involve a self-supervised objective, which can be reading the text and trying to predict the next word (see CLM) or
+  masking some words and trying to predict them (see MLM).
+- RNN: recurrent neural network, a type of model that uses a loop over a layer to process texts.
+- seq2seq or sequence-to-sequence: models that generate a new sequence from an input, like translation models, or
+  summarization models (such as :doc:`Bart </model_doc/bart>` or :doc:`T5 </model_doc/t5>`).
+- token: a part of a sentence, usually a word, but can also be a subword (non-common words are often split in subwords)
+  or a punctuation symbol.
+
+Model inputs
+------------

 Every model is different yet bears similarities with the others. Therefore most models use the same inputs, which are
 detailed here alongside usage examples.

+.. _input-ids:
+
 Input IDs
--------------------------
+~~~~~~~~~

 The input ids are often the only required parameters to be passed to the model as input. *They are token indices,
 numerical representations of tokens building the sequences that will be used as input by the model*.
@@ -15,33 +45,65 @@ tokenizer, which is a `WordPiece <https://arxiv.org/pdf/1609.08144.pdf>`__ token

 ::

-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

-    sequence = "A Titan RTX has 24GB of VRAM"
+    >>> sequence = "A Titan RTX has 24GB of VRAM"

 The tokenizer takes care of splitting the sequence into tokens available in the tokenizer vocabulary.

 ::

-    # Continuation of the previous script
-    tokenized_sequence = tokenizer.tokenize(sequence)
-    assert tokenized_sequence == ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
+    >>> tokenized_sequence = tokenizer.tokenize(sequence)

-These tokens can then be converted into IDs which are understandable by the model. Several methods are available for
-this, the recommended being `encode` or `encode_plus`, which leverage the Rust implementation of
+The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
+into "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash prefix is
+added for "RA" and "M":
+
+::
+
+    >>> print(tokenized_sequence)
+    ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']
+
+These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
+the sentence to the tokenizer, which leverages the Rust implementation of
 `huggingface/tokenizers <https://github.com/huggingface/tokenizers>`__ for peak performance.

 ::

-    # Continuation of the previous script
-    encoded_sequence = tokenizer.encode(sequence)
-    assert encoded_sequence == [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
+    >>> inputs = tokenizer(sequence)

-The `encode` and `encode_plus` methods automatically add "special tokens" which are special IDs the model uses.
+The tokenizer returns a dictionary with all the arguments necessary for its corresponding model to work properly. The
+token indices are under the key "input_ids":
+
+::
+
+    >>> encoded_sequence = inputs["input_ids"]
+    >>> print(encoded_sequence)
+    [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102]
+
+Note that the tokenizer automatically adds "special tokens" (if the associated model relies on them) which are special
+IDs the model sometimes uses.
+
+If we decode the previous sequence of ids,
+
+::
+
+    >>> decoded_sequence = tokenizer.decode(encoded_sequence)
+
+we will see
+
+::
+
+    >>> print(decoded_sequence)
+    [CLS] A Titan RTX has 24GB of VRAM [SEP]
+
+because this is the way a :class:`~transformers.BertModel` is going to expect its inputs.
+
+.. _attention-mask:

 Attention mask
--------------------------
+~~~~~~~~~~~~~~

 The attention mask is an optional argument used when batching sequences together. This argument indicates to the
 model which tokens should be attended to, and which should not.
@@ -50,107 +112,130 @@ For example, consider these two sequences:

 ::

-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")

-    sequence_a = "This is a short sequence."
-    sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."
+    >>> sequence_a = "This is a short sequence."
+    >>> sequence_b = "This is a rather long sequence. It is at least longer than the sequence A."

-    encoded_sequence_a = tokenizer.encode(sequence_a)
-    assert len(encoded_sequence_a) == 8
+    >>> encoded_sequence_a = tokenizer(sequence_a)["input_ids"]
+    >>> encoded_sequence_b = tokenizer(sequence_b)["input_ids"]

-    encoded_sequence_b = tokenizer.encode(sequence_b)
-    assert len(encoded_sequence_b) == 19
-
-These two sequences have different lengths and therefore can't be put together in a same tensor as-is. The first
-sequence needs to be padded up to the length of the second one, or the second one needs to be truncated down to
-the length of the first one.
-
-In the first case, the list of IDs will be extended by the padding indices:
+The encoded versions have different lengths:

 ::

-    # Continuation of the previous script
-    padded_sequence_a = tokenizer.encode(sequence_a, max_length=19, pad_to_max_length=True)
+    >>> len(encoded_sequence_a), len(encoded_sequence_b)
+    (8, 19)

-    assert padded_sequence_a == [101, 1188, 1110, 170, 1603, 4954,  119, 102,    0,    0,    0,    0,    0,    0,    0,    0,   0,   0,   0]
-    assert encoded_sequence_b == [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]
+Therefore, we can't put them together in the same tensor as-is. The first sequence needs to be padded up to the length
+of the second one, or the second one needs to be truncated down to the length of the first one.

-These can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
+In the first case, the list of IDs will be extended by the padding indices. We can pass a list to the tokenizer and ask
+it to pad like this:
+
+::
+
+    >>> padded_sequences = tokenizer([sequence_a, sequence_b], padding=True)
+
+We can see that 0s have been added on the right of the first sentence to make it the same length as the second one:
+
+::
+
+    >>> padded_sequences["input_ids"]
+    [[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]
+
+This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
 the position of the padded indices so that the model does not attend to them. For the
-:class:`~transformers.BertTokenizer`, :obj:`1` indicate a value that should be attended to while :obj:`0` indicate
-a padded value.
-
-The method :func:`~transformers.PreTrainedTokenizer.encode_plus` may be used to obtain the attention mask directly:
+:class:`~transformers.BertTokenizer`, :obj:`1` indicates a value that should be attended to, while :obj:`0` indicates
+a padded value. This attention mask is in the dictionary returned by the tokenizer under the key "attention_mask":

 ::

-    # Continuation of the previous script
-    sequence_a_dict = tokenizer.encode_plus(sequence_a, max_length=19, pad_to_max_length=True)
-
-    assert sequence_a_dict['input_ids'] == [101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
-    assert sequence_a_dict['attention_mask'] == [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    >>> padded_sequences["attention_mask"]
+    [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]

+.. _token-type-ids:

 Token Type IDs
--------------------------
+~~~~~~~~~~~~~~

 Some models' purpose is to do sequence classification or question answering. These require two different sequences to
-be encoded in the same input IDs. They are usually separated by special tokens, such as the classifier and separator
+be joined in a single "input_ids" entry, which usually is performed with the help of special tokens, such as the classifier (``[CLS]``) and separator (``[SEP]``)
 tokens. For example, the BERT model builds its two sequence input as such:

 ::

-    from transformers import BertTokenizer
-    tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+   >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]

-    # [CLS] SEQ_A [SEP] SEQ_B [SEP]
-
-    sequence_a = "HuggingFace is based in NYC"
-    sequence_b = "Where is HuggingFace based?"
-
-    encoded_sequence = tokenizer.encode(sequence_a, sequence_b)
-    assert tokenizer.decode(encoded_sequence) == "[CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]"
-
-This is enough for some models to understand where one sequence ends and where another begins. However, other models
-such as BERT have an additional mechanism, which are the segment IDs. The Token Type IDs are a binary mask identifying
-the different sequences in the model.
-
-We can leverage :func:`~transformers.PreTrainedTokenizer.encode_plus` to output the Token Type IDs for us:
+We can use our tokenizer to automatically generate such a sentence by passing the two sequences to ``tokenizer`` as two arguments (and
+not a list, like before) like this:

 ::

-    # Continuation of the previous script
-    encoded_dict = tokenizer.encode_plus(sequence_a, sequence_b)
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
+    >>> sequence_a = "HuggingFace is based in NYC"
+    >>> sequence_b = "Where is HuggingFace based?"

-    assert encoded_dict['input_ids'] == [101, 20164, 10932, 2271, 7954, 1110, 1359, 1107, 17520, 102, 2777, 1110, 20164, 10932, 2271, 7954, 1359, 136, 102]
-    assert encoded_dict['token_type_ids'] == [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+    >>> encoded_dict = tokenizer(sequence_a, sequence_b)
+    >>> decoded = tokenizer.decode(encoded_dict["input_ids"])

-The first sequence, the "context" used for the question, has all its tokens represented by :obj:`0`, whereas the
-question has all its tokens represented by :obj:`1`. Some models, like :class:`~transformers.XLNetModel` use an
-additional token represented by a :obj:`2`.
+which will return:

+::
+
+    >>> print(decoded)
+    [CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]
+
+This is enough for some models to understand where one sequence ends and where another begins. However, other models,
+such as BERT, also deploy token type IDs (also called segment IDs). They are represented as a binary
+mask identifying the two types of sequence in the model.
+
+The tokenizer returns this mask as the "token_type_ids" entry:
+
+::
+
+    >>> encoded_dict['token_type_ids']
+    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]
+
+The first sequence, the "context" used for the question, has all its tokens represented by a :obj:`0`, whereas the
+second sequence, corresponding to the "question", has all its tokens represented by a :obj:`1`.
+
+Some models, like :class:`~transformers.XLNetModel`, use an additional token represented by a :obj:`2`.
+
+.. _position-ids:

 Position IDs
--------------------------
+~~~~~~~~~~~~

-The position IDs are used by the model to identify which token is at which position. Contrary to RNNs that have the
-position of each token embedded within them, transformers are unaware of the position of each token. The position
-IDs are created for this purpose.
+Contrary to RNNs that have the position of each token embedded within them,
+transformers are unaware of the position of each token. Therefore, the position IDs (``position_ids``) are used by the model to identify each token's position in the list of tokens.

-They are an optional parameter. If no position IDs are passed to the model, they are automatically created as absolute
+They are an optional parameter. If no ``position_ids`` is passed to the model, the IDs are automatically created as absolute
 positional embeddings.

 Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models
 use other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.

+.. _feed-forward-chunking:

 Feed Forward Chunking
--------------------------
+~~~~~~~~~~~~~~~~~~~~~

-In transformers two feed forward layers usually follows the self attention layer in each residual attention block. The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (*e.g.* for ``bert-base-uncased``). 
+In each residual attention block in transformers the self-attention layer is usually followed by 2 feed forward layers.
+The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (e.g.,
+for ``bert-base-uncased``).

-For an input of size ``[batch_size, sequence_length]``, the memory required to store the intermediate feed forward embeddings ``[batch_size, sequence_length, config.intermediate_size]`` can account for a large fraction of the memory use. The authors of `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ noticed that since the computation is independent of the ``sequence_length`` dimension, it is mathematically equivalent to compute the output embeddings of both feed forward layers ``[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n``  individually and concat them afterward to ``[batch_size, sequence_length, config.hidden_size]`` with ``n = sequence_length``, which trades increased computation time against reduced memory use, but yields a mathematically **equivalent** result.
+For an input of size ``[batch_size, sequence_length]``, the memory required to store the intermediate feed forward
+embeddings ``[batch_size, sequence_length, config.intermediate_size]`` can account for a large fraction of the memory
+use. The authors of `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ noticed that since the
+computation is independent of the ``sequence_length`` dimension, it is mathematically equivalent to compute the output
+embeddings of both feed forward layers ``[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n``
+individually and concat them afterward to ``[batch_size, sequence_length, config.hidden_size]`` with
+``n = sequence_length``, which trades increased computation time against reduced memory use, but yields a
+mathematically **equivalent** result.

-For models employing the function :func:`~.transformers.apply_chunking_to_forward`, the ``chunk_size`` defines the number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time complexity. 
-If ``chunk_size`` is set to 0, no feed forward chunking is done.
+For models employing the function :func:`~.transformers.apply_chunking_to_forward`, the ``chunk_size`` defines the
+number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time
+complexity.  If ``chunk_size`` is set to 0, no feed forward chunking is done.
--- a/docs/source/imgs/local_attention_mask.png
+++ b/docs/source/imgs/local_attention_mask.png
--- a/docs/source/imgs/ppl_chunked.gif
+++ b/docs/source/imgs/ppl_chunked.gif
--- a/docs/source/imgs/ppl_full.gif
+++ b/docs/source/imgs/ppl_full.gif
--- a/docs/source/imgs/ppl_sliding.gif
+++ b/docs/source/imgs/ppl_sliding.gif
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -1,17 +1,18 @@
 Transformers
 ================================================================================================================================================

-🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose architectures
-(BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural Language Generation
-(NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between TensorFlow 2.0 and PyTorch.
+State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0.

-This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`__.
+🤗 Transformers (formerly known as `pytorch-transformers` and `pytorch-pretrained-bert`) provides general-purpose
+architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet...) for Natural Language Understanding (NLU) and Natural
+Language Generation (NLG) with 32+ pretrained models in 100+ languages and deep interoperability between
+TensorFlow 2.0 and PyTorch.
+
+This is the documentation of our repository `transformers <https://github.com/huggingface/transformers>`_.

 Features
 ---------------------------------------------------

- As easy to use as pytorch-transformers
- As powerful and concise as Keras
 - High performance on NLU and NLG tasks
 - Low barrier to entry for educators and practitioners

@@ -37,57 +38,167 @@ Choose the right framework for every part of a model's lifetime:
 Contents
 ---------------------------------

-The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and conversion utilities for the following models:
+The documentation is organized in five parts:

-1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-3. `GPT-2 <https://blog.openai.com/better-language-models>`_ (from OpenAI) released with the paper `Language Models are Unsupervised Multitask Learners <https://blog.openai.com/better-language-models>`_ by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-4. `Transformer-XL <https://github.com/kimiyoung/transformer-xl>`_ (from Google/CMU) released with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_ by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
-7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
-9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
-11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-13. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+- **GET STARTED** contains a quick tour, the installation instructions and some useful information about our philosophy
+  and a glossary.
+- **USING 🤗 TRANSFORMERS** contains general tutorials on how to use the library.
+- **ADVANCED GUIDES** contains more advanced guides that are more specific to a given script or part of the library.
+- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general research in
+  transformers models.
+- **PACKAGE REFERENCE** contains the documentation of each public class and function.
+
+The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and
+conversion utilities for the following models:
+
+1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep
+   Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei
+   Chang, Kenton Lee, and Kristina Toutanova.
+2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language
+   Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik
+   Narasimhan, Tim Salimans, and Ilya Sutskever.
+3. `GPT-2 <https://blog.openai.com/better-language-models>`_ (from OpenAI) released with the paper `Language Models are
+   Unsupervised Multitask Learners <https://blog.openai.com/better-language-models>`_ by Alec Radford, Jeffrey Wu,
+   Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever.
+4. `Transformer-XL <https://github.com/kimiyoung/transformer-xl>`_ (from Google/CMU) released with the paper
+   `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_ by
+   Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, and Ruslan Salakhutdinov.
+5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized
+   Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang, Zihang
+   Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, and Quoc V. Le.
+6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual
+   Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
+7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with
+   the paper `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan
+   Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and
+   Veselin Stoyanov.
+8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together
+   with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter
+   <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut, and Thomas Wolf. The same method has been
+   applied to compress GPT2 into
+   `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
+9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the
+   paper `CTRL: A Conditional Transformer Language Model for Controllable Generation
+   <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar, Bryan McCann, Lav R. Varshney, Caiming Xiong,
+   and Richard Socher.
+10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université)
+    released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by
+    Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la
+    Clergerie, Djame Seddah, and Benoît Sagot.
+11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper
+    `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_
+    by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut.
+12. `T5 <https://github.com/google-research/text-to-text-transfer-transformer>`_ (from Google) released with the paper
+    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
+    <https://arxiv.org/abs/1910.10683>`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
+    Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu.
+13. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together
+    with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by
+    Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard
+    Grave, Myle Ott, Luke Zettlemoyer, and Veselin Stoyanov.
+14. `MMBT <https://github.com/facebookresearch/mmbt/>`_ (from Facebook), released together with the paper `Supervised
+    Multimodal Bitransformers for Classifying Images and Text <https://arxiv.org/pdf/1909.02950.pdf>`_ by Douwe Kiela,
+    Suvrat Bhooshan, Hamed Firooz, and Davide Testuggine.
+15. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised
+    Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej,
+    Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, and
+    Didier Schwab.
+16. `BART <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_ (from Facebook) released with the paper
+    `BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension
+    <https://arxiv.org/pdf/1910.13461.pdf>`_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman
+    Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer.
+17. `ELECTRA <https://github.com/google-research/electra>`_ (from Google Research/Stanford University) released with
+    the paper `ELECTRA: Pre-training text encoders as discriminators rather than generators
+    <https://arxiv.org/abs/2003.10555>`_ by Kevin Clark, Minh-Thang Luong, Quoc V. Le, and Christopher D. Manning.
+18. `DialoGPT <https://github.com/microsoft/DialoGPT>`_ (from Microsoft Research) released with the paper `DialoGPT:
+    Large-Scale Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`_ by
+    Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu,
+    and Bill Dolan.
+19. `Reformer <https://github.com/google/trax/tree/master/trax/models/reformer>`_ (from Google Research) released with
+    the paper `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ by Nikita Kitaev, Łukasz
+    Kaiser, and Anselm Levskaya.
+20. `MarianMT <https://marian-nmt.github.io/>`_ (developed by the Microsoft Translator Team) machine translation models
+    trained using `OPUS <http://opus.nlpl.eu/>`_ pretrained_models data by Jörg Tiedemann.
+21. `Longformer <https://github.com/allenai/longformer>`_ (from AllenAI) released with the paper `Longformer: The
+    Long-Document Transformer <https://arxiv.org/abs/2004.05150>`_ by Iz Beltagy, Matthew E. Peters, and Arman Cohan.
+22. `DPR <https://github.com/facebookresearch/DPR>`_ (from Facebook) released with the paper `Dense Passage Retrieval
+    for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`_ by Vladimir Karpukhin, Barlas Oğuz, Sewon
+    Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+23. `Pegasus <https://github.com/google-research/pegasus>`_ (from Google) released with the paper `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization
+    <https://arxiv.org/abs/1912.08777>`_ by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+24. `MBart <https://github.com/pytorch/fairseq/tree/master/examples/mbart>`_ (from Facebook) released with the paper  `Multilingual Denoising Pre-training for Neural Machine Translation <https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov,
+    Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+25. `LXMERT <https://github.com/airsplay/lxmert>`_ (from UNC Chapel Hill) released with the paper `LXMERT: Learning
+    Cross-Modality Encoder Representations from Transformers <https://arxiv.org/abs/1908.07490>`_ by Hao Tan and
+    Mohit Bansal.
+26. `Funnel Transformer <https://github.com/laiguokun/Funnel-Transformer>`_ (from CMU/Google Brain) released with the paper
+    `Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing
+    <https://arxiv.org/abs/2006.03236>`_ by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+27. `Bert For Sequence Generation <https://tfhub.dev/s?module-type=text-generation&subtype=module,placeholder>`_ (from Google) released with the paper
+    `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks
+    <https://arxiv.org/abs/1907.12461>`_ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+28. `LayoutLM <https://github.com/microsoft/unilm/tree/master/layoutlm>`_ (from Microsoft Research Asia) released with the paper
+    `LayoutLM: Pre-training of Text and Layout for Document Image Understanding
+    <https://arxiv.org/abs/1912.13318>`_ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+29. `Other community models <https://huggingface.co/models>`_, contributed by the `community
+    <https://huggingface.co/users>`_.

 .. toctree::
    :maxdepth: 2
-    :caption: Notes
+    :caption: Get started

+    quicktour
    installation
-    quickstart
+    philosophy
    glossary
-    pretrained_models
-    usage
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Using 🤗 Transformers
+
+    task_summary
+    model_summary
+    preprocessing
+    training
    model_sharing
+    tokenizer_summary
+    multilingual
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Advanced guides
+
+    pretrained_models
    examples
+    custom_datasets
    notebooks
-    serialization
    converting_tensorflow_models
    migration
-    bertology
-    torchscript
-    multilingual
-    benchmarks
+    contributing
+    testing
+    serialization

 .. toctree::
    :maxdepth: 2
-    :caption: Main classes
+    :caption: Research

-    main_classes/configuration
-    main_classes/model
-    main_classes/tokenizer
-    main_classes/pipelines
-    main_classes/optimizer_schedules
-    main_classes/processors
+    bertology
+    perplexity
+    benchmarks

 .. toctree::
    :maxdepth: 2
    :caption: Package Reference

+    main_classes/configuration
+    main_classes/output
+    main_classes/model
+    main_classes/tokenizer
+    main_classes/pipelines
+    main_classes/trainer
+    main_classes/optimizer_schedules
+    main_classes/processors
+    main_classes/logging
    model_doc/auto
    model_doc/encoderdecoder
    model_doc/bert
@@ -110,3 +221,16 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
    model_doc/reformer
    model_doc/marian
    model_doc/longformer
+    model_doc/retribert
+    model_doc/mobilebert
+    model_doc/dpr
+    model_doc/pegasus
+    model_doc/mbart
+    model_doc/fsmt
+    model_doc/funnel
+    model_doc/lxmert
+    model_doc/bertgeneration
+    model_doc/layoutlm
+    internal/modeling_utils
+    internal/tokenization_utils
+    internal/pipelines_utils
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -1,51 +1,102 @@
 # Installation

-Transformers is tested on Python 3.6+ and PyTorch 1.1.0
+🤗 Transformers is tested on Python 3.6+, and PyTorch 1.1.0+ or TensorFlow 2.0+.

-## With pip
+You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're
+unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/). Create a virtual environment with the version of Python you're going 
+to use and activate it.

-PyTorch Transformers can be installed using pip as follows:
+Now, if you want to use 🤗 Transformers, you can install it with pip. If you'd like to play with the examples, you
+must install it from source.

-``` bash
+## Installation with pip
+
+First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
+Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available)
+and/or the [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific
+install command for your platform.
+
+When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
+
+```bash
 pip install transformers
 ```

-## From source
+Alternatively, for CPU-support only, you can install 🤗 Transformers and PyTorch in one line with:

-To install from source, clone the repository and install with:
+```bash
+pip install transformers[torch]
+```
+
+or 🤗 Transformers and TensorFlow 2.0 in one line with:
+
+```bash
+pip install transformers[tf-cpu]
+```
+
+To check 🤗 Transformers is properly installed, run the following command:
+
+```bash
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
+```
+
+It should download a pretrained model then print something like
+
+```bash
+[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]
+```
+
+(Note that TensorFlow will print additional stuff before that last statement.)
+
+## Installing from source
+
+To install from source, clone the repository and install with the following commands:

 ``` bash
 git clone https://github.com/huggingface/transformers.git
 cd transformers
-pip install .
+pip install -e .
 ```

-## Tests
+Again, you can run 

-An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
-
-Refer to the [contributing guide](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#tests) for details about running tests.
-
-## OpenAI GPT original tokenization workflow
-
-If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` and `SpaCy`:
-
-``` bash
-pip install spacy ftfy==4.4.3
-python -m spacy download en
+```bash
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
 ```

-If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will default to tokenize using BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
+to check 🤗 Transformers is properly installed.

-## Note on model downloads (Continuous Integration or large-scale deployments)
+## Caching models

-If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
+This library provides pretrained models that will be downloaded and cached locally. Unless you specify a location with
+`cache_dir=...` when you use methods like `from_pretrained`, these models will automatically be downloaded in the
+folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the PyTorch
+cache home followed by ``/transformers/`` (even if you don't have PyTorch installed). This is (by order of priority):
+
+  * shell environment variable ``TORCH_HOME``
+  * shell environment variable ``XDG_CACHE_HOME`` + ``/torch/``
+  * default: ``~/.cache/torch/``
+
+So if you don't have any specific environment variable set, the cache directory will be at
+``~/.cache/torch/transformers/``.
+
+**Note:** If you have set a shell environment variable for one of the predecessors of this library
+(``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
+environment variable for ``TRANSFORMERS_CACHE``.
+
+### Note on model downloads (Continuous Integration or large-scale deployments)
+
+If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through
+your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way
+faster, and cheaper. Feel free to contact us privately if you need any help.

 ## Do you want to run a Transformer model on a mobile device?

 You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.

-It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
+It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, 
+`DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.

-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
-or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
+At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch or
+TensorFlow 2.0 to productizing them in CoreML, or prototype a model or an app in CoreML then research its
+hyperparameters or architecture from PyTorch or TensorFlow 2.0. Super exciting!
--- a/docs/source/internal/modeling_utils.rst
+++ b/docs/source/internal/modeling_utils.rst
@@ -0,0 +1,88 @@
+Custom Layers and Utilities
+---------------------------
+
+This page lists all the custom layers used by the library, as well as the utility functions it provides for modeling.
+
+Most of those are only useful if you are studying the code of the models in the library.
+
+
+``PyTorch custom modules``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_utils.Conv1D
+
+.. autoclass:: transformers.modeling_utils.PoolerStartLogits
+    :members: forward
+
+.. autoclass:: transformers.modeling_utils.PoolerEndLogits
+    :members: forward
+
+.. autoclass:: transformers.modeling_utils.PoolerAnswerClass
+    :members: forward
+
+.. autoclass:: transformers.modeling_utils.SquadHeadOutput
+
+.. autoclass:: transformers.modeling_utils.SQuADHead
+    :members: forward
+
+.. autoclass:: transformers.modeling_utils.SequenceSummary
+    :members: forward
+
+
+``PyTorch Helper Functions``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.apply_chunking_to_forward
+
+.. autofunction:: transformers.modeling_utils.find_pruneable_heads_and_indices
+
+.. autofunction:: transformers.modeling_utils.prune_layer
+
+.. autofunction:: transformers.modeling_utils.prune_conv1d_layer
+
+.. autofunction:: transformers.modeling_utils.prune_linear_layer
+
+``TensorFlow custom layers``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_utils.TFConv1D
+
+.. autoclass:: transformers.modeling_tf_utils.TFSharedEmbeddings
+    :members: call
+
+.. autoclass:: transformers.modeling_tf_utils.TFSequenceSummary
+    :members: call
+
+
+``TensorFlow loss functions``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_utils.TFCausalLanguageModelingLoss
+    :members:
+
+.. autoclass:: transformers.modeling_tf_utils.TFMaskedLanguageModelingLoss
+    :members:
+
+.. autoclass:: transformers.modeling_tf_utils.TFMultipleChoiceLoss
+    :members:
+
+.. autoclass:: transformers.modeling_tf_utils.TFQuestionAnsweringLoss
+    :members:
+
+.. autoclass:: transformers.modeling_tf_utils.TFSequenceClassificationLoss
+    :members:
+
+.. autoclass:: transformers.modeling_tf_utils.TFTokenClassificationLoss
+    :members:
+
+
+``TensorFlow Helper Functions``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.modeling_tf_utils.cast_bool_to_primitive
+
+.. autofunction:: transformers.modeling_tf_utils.get_initializer
+
+.. autofunction:: transformers.modeling_tf_utils.keras_serializable
+
+.. autofunction:: transformers.modeling_tf_utils.shape_list
--- a/docs/source/internal/pipelines_utils.rst
+++ b/docs/source/internal/pipelines_utils.rst
@@ -0,0 +1,40 @@
+Utilities for pipelines
+-----------------------
+
+This page lists all the utility functions the library provides for pipelines.
+
+Most of those are only useful if you are studying the code of the pipelines in the library.
+
+
+Argument handling
+~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.pipelines.ArgumentHandler
+
+.. autoclass:: transformers.pipelines.ZeroShotClassificationArgumentHandler
+
+.. autoclass:: transformers.pipelines.QuestionAnsweringArgumentHandler
+
+
+Data format
+~~~~~~~~~~~
+
+.. autoclass:: transformers.pipelines.PipelineDataFormat
+    :members:
+
+.. autoclass:: transformers.pipelines.CsvPipelineDataFormat
+    :members:
+
+.. autoclass:: transformers.pipelines.JsonPipelineDataFormat
+    :members:
+
+.. autoclass:: transformers.pipelines.PipedPipelineDataFormat
+    :members:
+
+
+Utilities
+~~~~~~~~~
+
+.. autofunction:: transformers.pipelines.get_framework
+
+.. autoclass:: transformers.pipelines.PipelineException
--- a/docs/source/internal/tokenization_utils.rst
+++ b/docs/source/internal/tokenization_utils.rst
@@ -0,0 +1,38 @@
+Utilities for Tokenizers
+------------------------
+
+This page lists all the utility functions used by the tokenizers, mainly the class
+:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that implements the common methods between
+:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` and the mixin
+:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.
+
+Most of those are only useful if you are studying the code of the tokenizers in the library.
+
+``PreTrainedTokenizerBase``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.tokenization_utils_base.PreTrainedTokenizerBase
+    :special-members: __call__
+    :members:
+
+
+``SpecialTokensMixin``
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.tokenization_utils_base.SpecialTokensMixin
+    :members:
+
+
+Enums and namedtuples
+~~~~~~~~~~~~~~~~~~~~~
+.. autoclass:: transformers.tokenization_utils_base.ExplicitEnum
+
+.. autoclass:: transformers.tokenization_utils_base.PaddingStrategy
+
+.. autoclass:: transformers.tokenization_utils_base.TensorType
+
+.. autoclass:: transformers.tokenization_utils_base.TruncationStrategy
+
+.. autoclass:: transformers.tokenization_utils_base.CharSpan
+
+.. autoclass:: transformers.tokenization_utils_base.TokenSpan
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -1,10 +1,13 @@
 Configuration
 ----------------------------------------------------

-The base class ``PretrainedConfig`` implements the common methods for loading/saving a configuration either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
+The base class :class:`~transformers.PretrainedConfig` implements the common methods for loading/saving a configuration
+either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded
+from HuggingFace's AWS S3 repository).

-``PretrainedConfig``
-~~~~~~~~~~~~~~~~~~~~~
+
+PretrainedConfig
+~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PretrainedConfig
    :members:
--- a/docs/source/main_classes/logging.rst
+++ b/docs/source/main_classes/logging.rst
@@ -0,0 +1,52 @@
+Logging
+-------
+
+🤗 Transformers has a centralized logging system, so that you can set up the verbosity of the library easily.
+
+Currently the default verbosity of the library is ``WARNING``.
+
+To change the level of verbosity, just use one of the direct setters. For instance, here is how to change the verbosity to the INFO level.
+
+.. code-block:: python
+
+    import transformers
+    transformers.logging.set_verbosity_info()
+
+You can also use the environment variable ``TRANSFORMERS_VERBOSITY`` to override the default verbosity. You can set it to one of the following: ``debug``, ``info``, ``warning``, ``error``, ``critical``. For example:
+
+.. code-block:: bash
+               
+    TRANSFORMERS_VERBOSITY=error ./myprogram.py
+
+All the methods of this logging module are documented below. The main ones are
+:func:`transformers.logging.get_verbosity` to get the current level of verbosity in the logger and
+:func:`transformers.logging.set_verbosity` to set the verbosity to the level of your choice. In order (from the least
+verbose to the most verbose), those levels (with their corresponding int values in parentheses) are:
+
+- :obj:`transformers.logging.CRITICAL` or :obj:`transformers.logging.FATAL` (int value, 50): only report the most
+  critical errors.
+- :obj:`transformers.logging.ERROR` (int value, 40): only report errors.
+- :obj:`transformers.logging.WARNING` or :obj:`transformers.logging.WARN` (int value, 30): only report errors and
+  warnings. This is the default level used by the library.
+- :obj:`transformers.logging.INFO` (int value, 20): report errors, warnings and basic information.
+- :obj:`transformers.logging.DEBUG` (int value, 10): report all information.
+
+Base setters
+~~~~~~~~~~~~
+
+.. autofunction:: transformers.logging.set_verbosity_error
+
+.. autofunction:: transformers.logging.set_verbosity_warning
+
+.. autofunction:: transformers.logging.set_verbosity_info
+
+.. autofunction:: transformers.logging.set_verbosity_debug
+
+Other functions
+~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.logging.get_verbosity
+
+.. autofunction:: transformers.logging.set_verbosity
+
+.. autofunction:: transformers.logging.get_logger
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -1,23 +1,34 @@
 Models
 ----------------------------------------------------

-The base class ``PreTrainedModel`` implements the common methods for loading/saving a model either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
+The base classes :class:`~transformers.PreTrainedModel` and :class:`~transformers.TFPreTrainedModel` implement the
+common methods for loading/saving a model either from a local file or directory, or from a pretrained model
+configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).

-``PreTrainedModel`` also implements a few methods which are common among all the models to:
+:class:`~transformers.PreTrainedModel` and :class:`~transformers.TFPreTrainedModel` also implement a few methods which
+are common among all the models to:

 - resize the input token embeddings when new tokens are added to the vocabulary
 - prune the attention heads of the model.

+The other methods that are common to each model are defined in :class:`~transformers.modeling_utils.ModuleUtilsMixin`
+(for the PyTorch models) and :class:`~transformers.modeling_tf_utils.TFModelUtilsMixin` (for the TensorFlow models) or
+for text generation, :class:`~transformers.generation_utils.GenerationMixin` (for the PyTorch models) and
+:class:`~transformers.generation_tf_utils.TFGenerationMixin` (for the TensorFlow models).
+
+
 ``PreTrainedModel``
 ~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PreTrainedModel
    :members:

-``Helper Functions``
-~~~~~~~~~~~~~~~~~~~~~

-.. autofunction:: transformers.apply_chunking_to_forward
+``ModuleUtilsMixin``
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_utils.ModuleUtilsMixin
+    :members:


 ``TFPreTrainedModel``
@@ -25,3 +36,20 @@ The base class ``PreTrainedModel`` implements the common methods for loading/sav

 .. autoclass:: transformers.TFPreTrainedModel
    :members:
+
+
+``TFModelUtilsMixin``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_utils.TFModelUtilsMixin
+    :members:
+
+
+Generative models
+~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.generation_utils.GenerationMixin
+    :members:
+
+.. autoclass:: transformers.generation_tf_utils.TFGenerationMixin
+    :members:
--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -1,4 +1,4 @@
-Optimizer
+Optimization
 ----------------------------------------------------

 The ``.optimization`` module provides:
@@ -7,25 +7,30 @@ The ``.optimization`` module provides:
 - several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
 - a gradient accumulation class to accumulate the gradients of multiple batches

-``AdamW``
-~~~~~~~~~~~~~~~~
+``AdamW`` (PyTorch)
+~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AdamW
    :members:

-``AdamWeightDecay``
-~~~~~~~~~~~~~~~~~~~
+``AdaFactor`` (PyTorch)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Adafactor
+
+``AdamWeightDecay`` (TensorFlow)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AdamWeightDecay
-    :members:

 .. autofunction:: transformers.create_optimizer

 Schedules
----------------------------------------------------
+~~~~~~~~~~~~~~~~~~~
+
+Learning Rate Schedules (PyTorch)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Learning Rate Schedules
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 .. autofunction:: transformers.get_constant_schedule


@@ -57,16 +62,16 @@ Learning Rate Schedules
    :target: /imgs/warmup_linear_schedule.png
    :alt:

-``Warmup``
-~~~~~~~~~~~~~~~~
+``Warmup`` (TensorFlow)
+^^^^^^^^^^^^^^^^^^^^^^^

 .. autoclass:: transformers.WarmUp
    :members:

 Gradient Strategies
----------------------------------------------------
+~~~~~~~~~~~~~~~~~~~~

-``GradientAccumulator``
-~~~~~~~~~~~~~~~~~~~~~~~
+``GradientAccumulator`` (TensorFlow)
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 .. autoclass:: transformers.GradientAccumulator
--- a/docs/source/main_classes/output.rst
+++ b/docs/source/main_classes/output.rst
@@ -0,0 +1,141 @@
+Model outputs
+-------------
+
+PyTorch models have outputs that are instances of subclasses of :class:`~transformers.file_utils.ModelOutput`. Those
+are data structures containing all the information returned by the model, but that can also be used as tuples or
+dictionaries.
+
+Let's see how this looks in an example:
+
+.. code-block::
+
+    from transformers import BertTokenizer, BertForSequenceClassification
+    import torch
+
+    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+    inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+    labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+    outputs = model(**inputs, labels=labels)
+
+The ``outputs`` object is a :class:`~transformers.modeling_outputs.SequenceClassifierOutput`. As we can see in the
+documentation of that class below, it means it has an optional ``loss``, a ``logits``, an optional ``hidden_states`` and
+an optional ``attentions`` attribute. Here we have the ``loss`` since we passed along ``labels``, but we don't have
+``hidden_states`` and ``attentions`` because we didn't pass ``output_hidden_states=True`` or
+``output_attentions=True``.
+
+You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
+will get ``None``. Here for instance ``outputs.loss`` is the loss computed by the model, and ``outputs.attentions`` is
+``None``.
+
+When considering our ``outputs`` object as a tuple, it only considers the attributes that don't have ``None`` values.
+Here for instance, it has two elements, ``loss`` then ``logits``, so
+
+.. code-block::
+
+    outputs[:2]
+
+will return the tuple ``(outputs.loss, outputs.logits)`` for instance.
+
+When considering our ``outputs`` object as a dictionary, it only considers the attributes that don't have ``None``
+values. Here for instance, it has two keys that are ``loss`` and ``logits``.
+
+We document here the generic model outputs that are used by more than one model type. Specific output types are
+documented on their corresponding model page.
+
+``ModelOutput``
+~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.file_utils.ModelOutput
+    :members:
+
+``BaseModelOutput``
+~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.BaseModelOutput
+    :members:
+
+``BaseModelOutputWithPooling``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPooling
+    :members:
+
+``BaseModelOutputWithPast``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPast
+    :members:
+
+``Seq2SeqModelOutput``
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.Seq2SeqModelOutput
+    :members:
+
+``CausalLMOutput``
+~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.CausalLMOutput
+    :members:
+
+``CausalLMOutputWithPast``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.CausalLMOutputWithPast
+    :members:
+
+``MaskedLMOutput``
+~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.MaskedLMOutput
+    :members:
+
+``Seq2SeqLMOutput``
+~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.Seq2SeqLMOutput
+    :members:
+
+``NextSentencePredictorOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.NextSentencePredictorOutput
+    :members:
+
+``SequenceClassifierOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.SequenceClassifierOutput
+    :members:
+
+``Seq2SeqSequenceClassifierOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.Seq2SeqSequenceClassifierOutput
+    :members:
+
+``MultipleChoiceModelOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.MultipleChoiceModelOutput
+    :members:
+
+``TokenClassifierOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.TokenClassifierOutput
+    :members:
+
+``QuestionAnsweringModelOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.QuestionAnsweringModelOutput
+    :members:
+
+``Seq2SeqQuestionAnsweringModelOutput``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.Seq2SeqQuestionAnsweringModelOutput
+    :members:
--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -3,13 +3,25 @@ Pipelines

 The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most
 of the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity
-Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering.
+Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering. See the
+:doc:`task summary <../task_summary>` for examples of use.

 There are two categories of pipeline abstractions to be aware about:

- The :class:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines
- The other task-specific pipelines, such as :class:`~transformers.NerPipeline`
-  or :class:`~transformers.QuestionAnsweringPipeline`
+- The :func:`~transformers.pipeline` which is the most powerful object encapsulating all other pipelines.
+- The other task-specific pipelines:
+
+    - :class:`~transformers.ConversationalPipeline`
+    - :class:`~transformers.FeatureExtractionPipeline`
+    - :class:`~transformers.FillMaskPipeline`
+    - :class:`~transformers.QuestionAnsweringPipeline`
+    - :class:`~transformers.SummarizationPipeline`
+    - :class:`~transformers.TextClassificationPipeline`
+    - :class:`~transformers.TextGenerationPipeline`
+    - :class:`~transformers.TokenClassificationPipeline`
+    - :class:`~transformers.TranslationPipeline`
+    - :class:`~transformers.ZeroShotClassificationPipeline`
+    - :class:`~transformers.Text2TextGenerationPipeline`

 The pipeline abstraction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -17,58 +29,92 @@ The pipeline abstraction
 The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any
 other pipeline but requires an additional argument which is the `task`.

-.. autoclass:: transformers.pipeline
-    :members:
+.. autofunction:: transformers.pipeline


 The task specific pipelines
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Parent class: Pipeline
-=========================================
-
-.. autoclass:: transformers.Pipeline
-    :members: predict, transform, save_pretrained
-
-NerPipeline
+ConversationalPipeline
 ==========================================

-.. autoclass:: transformers.NerPipeline
+.. autoclass:: transformers.Conversation

-TokenClassificationPipeline
-==========================================
-
-This class is an alias of the :class:`~transformers.NerPipeline` defined above. Please refer to that pipeline for
-documentation and usage examples.
-
-FillMaskPipeline
-==========================================
-
-.. autoclass:: transformers.FillMaskPipeline
+.. autoclass:: transformers.ConversationalPipeline
+    :special-members: __call__
+    :members:

 FeatureExtractionPipeline
 ==========================================

 .. autoclass:: transformers.FeatureExtractionPipeline
+    :special-members: __call__
+    :members:

-TextClassificationPipeline
+FillMaskPipeline
 ==========================================

-.. autoclass:: transformers.TextClassificationPipeline
+.. autoclass:: transformers.FillMaskPipeline
+    :special-members: __call__
+    :members:
+
+NerPipeline
+==========================================
+
+This class is an alias of the :class:`~transformers.TokenClassificationPipeline` defined below. Please refer to that
+pipeline for documentation and usage examples.

 QuestionAnsweringPipeline
 ==========================================

 .. autoclass:: transformers.QuestionAnsweringPipeline
-
+    :special-members: __call__
+    :members:

 SummarizationPipeline
 ==========================================

 .. autoclass:: transformers.SummarizationPipeline
+    :special-members: __call__
+    :members:

+TextClassificationPipeline
+==========================================
+
+.. autoclass:: transformers.TextClassificationPipeline
+    :special-members: __call__
+    :members:

 TextGenerationPipeline
 ==========================================

 .. autoclass:: transformers.TextGenerationPipeline
+    :special-members: __call__
+    :members:
+
+Text2TextGenerationPipeline
+==========================================
+
+.. autoclass:: transformers.Text2TextGenerationPipeline
+    :special-members: __call__
+    :members:
+
+TokenClassificationPipeline
+==========================================
+
+.. autoclass:: transformers.TokenClassificationPipeline
+    :special-members: __call__
+    :members:
+
+ZeroShotClassificationPipeline
+==========================================
+
+.. autoclass:: transformers.ZeroShotClassificationPipeline
+    :special-members: __call__
+    :members:
+
+Parent class: :obj:`Pipeline`
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Pipeline
+    :members:
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -1,38 +1,59 @@
 Tokenizer
 ----------------------------------------------------

-A tokenizer is in charge of preparing the inputs for a model. The library comprise tokenizers for all the models. Most of the tokenizers are available in two flavors: a full python implementation and a "Fast" implementation based on the Rust library `tokenizers`. The "Fast" implementations allows (1) a significant speed-up in particular when doing batched tokenization and (2) additional methods to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token). Currently no "Fast" implementation is available for the SentencePiece-based tokenizers (for T5, ALBERT, CamemBERT, XLMRoBERTa and XLNet models).
+A tokenizer is in charge of preparing the inputs for a model. The library contains tokenizers for all the models. Most
+of the tokenizers are available in two flavors: a full python implementation and a "Fast" implementation based on the
+Rust library `tokenizers <https://github.com/huggingface/tokenizers>`__. The "Fast" implementations allow:

-The base classes ``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` implements the common methods for encoding string inputs in model inputs (see below) and instantiating/saving python and "Fast" tokenizers either from a local file or directory or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).
+1. a significant speed-up in particular when doing batched tokenization and
+2. additional methods to map between the original string (character and words) and the token space (e.g. getting the
+   index of the token comprising a given character or the span of characters corresponding to a given token). Currently
+   no "Fast" implementation is available for the SentencePiece-based tokenizers (for T5, ALBERT, CamemBERT, XLMRoBERTa
+   and XLNet models).

-``PreTrainedTokenizer`` and ``PreTrainedTokenizerFast`` thus implements the main methods for using all the tokenizers:
+The base classes :class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast`
+implement the common methods for encoding string inputs into model inputs (see below) and instantiating/saving python and
+"Fast" tokenizers either from a local file or directory or from a pretrained tokenizer provided by the library
+(downloaded from HuggingFace's AWS S3 repository). They both rely on
+:class:`~transformers.tokenization_utils_base.PreTrainedTokenizerBase` that contains the common methods, and
+:class:`~transformers.tokenization_utils_base.SpecialTokensMixin`.

- tokenizing (spliting strings in sub-word token strings), converting tokens strings to ids and back, and encoding/decoding (i.e. tokenizing + convert to integers),
- adding new tokens to the vocabulary in a way that is independant of the underlying structure (BPE, SentencePiece...),
- managing special tokens like mask, beginning-of-sentence, etc tokens (adding them, assigning them to attributes in the tokenizer for easy access and making sure they are not split during tokenization)
+:class:`~transformers.PreTrainedTokenizer` and :class:`~transformers.PreTrainedTokenizerFast` thus implement the main
+methods for using all the tokenizers:
+
+- Tokenizing (splitting strings into sub-word token strings), converting token strings to ids and back, and
+  encoding/decoding (i.e., tokenizing and converting to integers).
+- Adding new tokens to the vocabulary in a way that is independent of the underlying structure (BPE, SentencePiece...).
+- Managing special tokens (like mask, beginning-of-sentence, etc.): adding them, assigning them to attributes in the
+  tokenizer for easy access and making sure they are not split during tokenization.
+
+:class:`~transformers.BatchEncoding` holds the output of the tokenizer's encoding methods (``__call__``,
+``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python
+tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by these
+methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e., backed by HuggingFace
+`tokenizers library <https://github.com/huggingface/tokenizers>`__), this class provides in addition several advanced
+alignment methods which can be used to map between the original string (character and words) and the token space (e.g.,
+getting the index of the token comprising a given character or the span of characters corresponding to a given token).

-``BatchEncoding`` holds the output of the tokenizer's encoding methods (``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python tokenizer, this class behave just like a standard python dictionary and hold the various model inputs computed by these methodes (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e. backed by HuggingFace tokenizers library), this class provides in addition several advanced alignement methods which can be used to map between the original string (character and words) and the token space (e.g. getting the index of the token comprising a given character or the span of characters corresponding to a given token).

 ``PreTrainedTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PreTrainedTokenizer
+    :special-members: __call__
    :members:

+
 ``PreTrainedTokenizerFast``
-~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.PreTrainedTokenizerFast
+    :special-members: __call__
    :members:

+
 ``BatchEncoding``
 ~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BatchEncoding
    :members:
-
-``SpecialTokensMixin``
-~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.SpecialTokensMixin
-    :members:
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -0,0 +1,75 @@
+Trainer
+----------
+
+The :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` classes provide an API for feature-complete
+training in most standard use cases. It's used in most of the :doc:`example scripts <../examples>`.
+
+Before instantiating your :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`, create a 
+:class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments` to access all the points of
+customization during training.
+
+The API supports distributed training on multiple GPUs/TPUs, mixed precision through `NVIDIA Apex
+<https://github.com/NVIDIA/apex>`__ for PyTorch and :obj:`tf.keras.mixed_precision` for TensorFlow.
+
+Both :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` contain the basic training loop supporting the
+previous features. To inject custom behavior you can subclass them and override the following methods:
+
+- **get_train_dataloader**/**get_train_tfdataset** -- Creates the training DataLoader (PyTorch) or TF Dataset.
+- **get_eval_dataloader**/**get_eval_tfdataset** -- Creates the evaluation DataLoader (PyTorch) or TF Dataset.
+- **get_test_dataloader**/**get_test_tfdataset** -- Creates the test DataLoader (PyTorch) or TF Dataset.
+- **log** -- Logs information on the various objects watching training.
+- **setup_wandb** -- Sets up wandb (see `here <https://docs.wandb.com/huggingface>`__ for more information).
+- **create_optimizer_and_scheduler** -- Sets up the optimizer and learning rate scheduler if they were not passed at
+  init.
+- **compute_loss** -- Computes the loss on a batch of training inputs.
+- **training_step** -- Performs a training step.
+- **prediction_step** -- Performs an evaluation/test step.
+- **run_model** (TensorFlow only) -- Basic pass through the model.
+- **evaluate** -- Runs an evaluation loop and returns metrics.
+- **predict** -- Returns predictions (with metrics if labels are available) on a test set.
+
+Here is an example of how to customize :class:`~transformers.Trainer` using a custom loss function:
+
+.. code-block:: python
+
+    from transformers import Trainer
+    class MyTrainer(Trainer):
+        def compute_loss(self, model, inputs):
+            labels = inputs.pop("labels")
+            outputs = model(**inputs)
+            logits = outputs[0]
+            return my_custom_loss(logits, labels)
+
+
+``Trainer`` 
+~~~~~~~~~~~
+
+.. autoclass:: transformers.Trainer
+    :members:
+
+``TFTrainer`` 
+~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTrainer
+    :members:
+
+``TrainingArguments``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TrainingArguments
+    :members:
+
+``TFTrainingArguments``
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTrainingArguments
+    :members:
+
+Utilities
+~~~~~~~~~
+
+.. autoclass:: transformers.EvalPrediction
+
+.. autofunction:: transformers.set_seed
+
+.. autofunction:: transformers.torch_distributed_zero_first
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -1,8 +1,8 @@
 # Migrating from previous packages

-## Migrating from pytorch-transformers to transformers
+## Migrating from pytorch-transformers to 🤗 Transformers

-Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
+Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to 🤗 Transformers.

 ### Positional order of some models' keywords inputs (`attention_mask`, `token_type_ids`...) changed

@@ -14,17 +14,17 @@ If you used to call the models with positional inputs for keyword arguments, e.g

 ## Migrating from pytorch-pretrained-bert

-Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to `transformers`
+Here is a quick summary of what you should take care of when migrating from `pytorch-pretrained-bert` to 🤗 Transformers

 ### Models always output `tuples`

-The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+The main breaking change when migrating from `pytorch-pretrained-bert` to 🤗 Transformers is that the models' forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.

 The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/transformers/).

 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.

-Here is a `pytorch-pretrained-bert` to `transformers` conversion example for a `BertForSequenceClassification` classification model:
+Here is a `pytorch-pretrained-bert` to 🤗 Transformers conversion example for a `BertForSequenceClassification` classification model:

 ```python
 # Let's load our model
@@ -33,11 +33,11 @@ model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
 # If you used to have this line in pytorch-pretrained-bert:
 loss = model(input_ids, labels=labels)

-# Now just use this line in transformers to extract the loss from the output tuple:
+# Now just use this line in 🤗 Transformers to extract the loss from the output tuple:
 outputs = model(input_ids, labels=labels)
 loss = outputs[0]

-# In transformers you can also have access to the logits:
+# In 🤗 Transformers you can also have access to the logits:
 loss, logits = outputs[:2]

 # And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
@@ -109,7 +109,7 @@ for batch in train_data:
    loss.backward()
    optimizer.step()

-### In Transformers, optimizer and schedules are splitted and instantiated like this:
+### In 🤗 Transformers, optimizer and schedules are split and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
 ### and used like this:
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -47,6 +47,16 @@ AlbertTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+Albert specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_albert.AlbertForPreTrainingOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_albert.TFAlbertForPreTrainingOutput
+    :members:
+
+
 AlbertModel
 ~~~~~~~~~~~~~~~~~~~~

@@ -54,6 +64,13 @@ AlbertModel
    :members:


+AlbertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertForPreTraining
+    :members:
+
+
 AlbertForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -68,6 +85,20 @@ AlbertForSequenceClassification
    :members:


+AlbertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertForMultipleChoice
+    :members:
+
+
+AlbertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertForTokenClassification
+    :members:
+
+
 AlbertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -82,6 +113,13 @@ TFAlbertModel
    :members:


+TFAlbertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertForPreTraining
+    :members:
+
+
 TFAlbertForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -103,6 +141,13 @@ TFAlbertForMultipleChoice
    :members:


+TFAlbertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAlbertForTokenClassification
+    :members:
+
+
 TFAlbertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -1,65 +1,131 @@
-AutoModels
+AutoClasses
 -----------

-In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method.
+In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you
+are supplying to the :obj:`from_pretrained()` method.
+AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path
+to the pretrained weights/config/vocabulary.

-AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary:
-
-Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of ``BertModel``).
+Instantiating one of :class:`~transformers.AutoConfig`, :class:`~transformers.AutoModel`, and
+:class:`~transformers.AutoTokenizer` will directly create a class of the relevant architecture. For instance


-``AutoConfig``
-~~~~~~~~~~~~~~~~~~~~~
+.. code-block:: python
+
+    model = AutoModel.from_pretrained('bert-base-cased')
+
+will create a model that is an instance of :class:`~transformers.BertModel`.
+
+There is one class of :obj:`AutoModel` for each task, and for each backend (PyTorch or TensorFlow).
+
+
+AutoConfig
+~~~~~~~~~~

 .. autoclass:: transformers.AutoConfig
    :members:


-``AutoTokenizer``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+AutoTokenizer
+~~~~~~~~~~~~~

 .. autoclass:: transformers.AutoTokenizer
    :members:


-``AutoModel``
-~~~~~~~~~~~~~~~~~~~~~
+AutoModel
+~~~~~~~~~

 .. autoclass:: transformers.AutoModel
    :members:


-``AutoModelForPreTraining``
-~~~~~~~~~~~~~~~~~~~~~
+AutoModelForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AutoModelForPreTraining
    :members:


-``AutoModelWithLMHead``
-~~~~~~~~~~~~~~~~~~~~~
+AutoModelWithLMHead
+~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AutoModelWithLMHead
    :members:


-``AutoModelForSequenceClassification``
-~~~~~~~~~~~~~~~~~~~~~
+AutoModelForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AutoModelForSequenceClassification
    :members:


-``AutoModelForQuestionAnswering``
-~~~~~~~~~~~~~~~~~~~~~
+AutoModelForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForMultipleChoice
+    :members:
+
+
+AutoModelForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForTokenClassification
+    :members:
+
+
+AutoModelForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.AutoModelForQuestionAnswering
    :members:


-``AutoModelForTokenClassification``
-~~~~~~~~~~~~~~~~~~~~~
+TFAutoModel
+~~~~~~~~~~~

-.. autoclass:: transformers.AutoModelForTokenClassification
+.. autoclass:: transformers.TFAutoModel
    :members:

+
+TFAutoModelForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForPreTraining
+    :members:
+
+
+TFAutoModelWithLMHead
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelWithLMHead
+    :members:
+
+
+TFAutoModelForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForSequenceClassification
+    :members:
+
+
+TFAutoModelForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForMultipleChoice
+    :members:
+
+
+TFAutoModelForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForTokenClassification
+    :members:
+
+
+TFAutoModelForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/bart.rst
+++ b/docs/source/model_doc/bart.rst
@@ -4,8 +4,9 @@ Bart
 file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
@sshleifer

-Paper
-~~~~~
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The Bart model was `proposed <https://arxiv.org/abs/1910.13461>`_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019.
 According to the abstract,

@@ -18,11 +19,34 @@ The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/ma

 Implementation Notes
 ~~~~~~~~~~~~~~~~~~~~
+
 - Bart doesn't use :obj:`token_type_ids` for sequence classification. Use BartTokenizer.encode to get the proper splitting.
 - The forward pass of ``BartModel`` will create decoder inputs (using the helper function ``transformers.modeling_bart._prepare_bart_decoder_inputs``)  if they are not passed. This is different than some other modeling APIs.
 - Model predictions are intended to be identical to the original implementation. This only works, however, if the string you pass to ``fairseq.encode`` starts with a space.
 - ``BartForConditionalGeneration.generate`` should be used for conditional generation tasks like summarization, see the example in that docstrings
 - Models that load the ``"facebook/bart-large-cnn"`` weights will not have a ``mask_token_id``, or be able to perform mask filling tasks.
+- For training/forward passes that don't involve beam search, pass ``use_cache=False``.
+
+
+BartForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartForConditionalGeneration
+    :members: forward
+
+
+BartConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartConfig
+    :members:
+
+
+BartTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartTokenizer
+    :members:



@@ -35,22 +59,17 @@ BartModel
 .. autofunction:: transformers.modeling_bart._prepare_bart_decoder_inputs


-BartForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.BartForConditionalGeneration
-    :members: generate, forward
-
-
 BartForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.BartForSequenceClassification
    :members: forward

-BartConfig
-~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.BartConfig
-    :members:
+BartForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartForQuestionAnswering
+    :members: forward
+

--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -27,13 +27,8 @@ Tips:

 - BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
  the right rather than the left.
- BERT was trained with a masked language modeling (MLM) objective. It is therefore efficient at predicting masked
-  tokens and at NLU in general, but is not optimal for text generation. Models trained with a causal language
-  modeling (CLM) objective are better in that regard.
- Alongside MLM, BERT was trained using a next sentence prediction (NSP) objective using the [CLS] token as a sequence
-  approximate. The user may use this token (the first token in a sequence built with special tokens) to get a sequence
-  prediction rather than a token prediction. However, averaging over the sequence may yield better results than using
-  the [CLS] token.
+- BERT was trained with the masked language modeling (MLM) and next sentence prediction (NSP) objectives. It is efficient at predicting masked
+  tokens and at NLU in general, but is not optimal for text generation.

 The original code can be found `here <https://github.com/google-research/bert>`_.

@@ -59,6 +54,16 @@ BertTokenizerFast
    :members:


+Bert specific outputs
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_bert.BertForPreTrainingOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_bert.TFBertForPreTrainingOutput
+    :members:
+
+
 BertModel
 ~~~~~~~~~~~~~~~~~~~~

@@ -73,6 +78,13 @@ BertForPreTraining
    :members:


+BertLMHeadModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertLMHeadModel
+    :members:
+
+
 BertForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -129,6 +141,13 @@ TFBertForPreTraining
    :members:


+TFBertLMHeadModel
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBertLMHeadModel
+    :members:
+
+
 TFBertForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/bertgeneration.rst
+++ b/docs/source/model_doc/bertgeneration.rst
@@ -0,0 +1,82 @@
+BertGeneration
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The BertGeneration model is a BERT model that can be leveraged for sequence-to-sequence tasks using :class:`~transformers.EncoderDecoderModel` as proposed in `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+
+The abstract from the paper is the following:
+
+*Unsupervised pre-training of large neural models has recently revolutionized Natural Language Processing. By warm-starting from the publicly released checkpoints, NLP practitioners have pushed the state-of-the-art on multiple benchmarks while saving significant amounts of compute time. So far the focus has been mainly on the Natural Language Understanding tasks. In this paper, we demonstrate the efficacy of pre-trained checkpoints for Sequence Generation. We developed a Transformer-based sequence-to-sequence model that is compatible with publicly available pre-trained BERT, GPT-2 and RoBERTa checkpoints and conducted an extensive empirical study on the utility of initializing our model, both encoder and decoder, with these checkpoints. Our models result in new state-of-the-art results on Machine Translation, Text Summarization, Sentence Splitting, and Sentence Fusion.*
+
+Usage:
+
+- The model can be used in combination with the :class:`~transformers.EncoderDecoderModel` to leverage two pretrained BERT checkpoints for subsequent fine-tuning.
+
+::
+  
+  # leverage checkpoints for Bert2Bert model...
+  encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)  # use BERT's cls token as BOS token and sep token as EOS token
+  decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)  # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
+  bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
+  
+  # create tokenizer...
+  tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")
+
+  input_ids = tokenizer('This is a long article to summarize', add_special_tokens=False, return_tensors="pt").input_ids
+  labels = tokenizer('This is a short summary', return_tensors="pt").input_ids
+
+  # train...
+  loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels, return_dict=True).loss
+  loss.backward()
+
+
+- Pretrained :class:`~transformers.EncoderDecoderModel` are also directly available in the model hub, *e.g.*:
+
+
+::
+
+  # instantiate sentence fusion model
+  sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
+  tokenizer = AutoTokenizer.from_pretrained("google/roberta2roberta_L-24_discofuse")
+
+  input_ids = tokenizer('This is the first sentence. This is the second sentence.', add_special_tokens=False, return_tensors="pt").input_ids
+
+  outputs = sentence_fuser.generate(input_ids)
+
+  print(tokenizer.decode(outputs[0]))
+
+
+Tips:
+
+- :class:`~transformers.BertGenerationEncoder` and :class:`~transformers.BertGenerationDecoder` should be used in combination with :class:`~transformers.EncoderDecoderModel`.
+- For summarization, sentence splitting, sentence fusion and translation, no special tokens are required for the input. Therefore, no EOS token should be added to the end of the input.
+
+The original code can be found `here <https://tfhub.dev/s?module-type=text-generation&subtype=module,placeholder>`__.
+
+BertGenerationConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertGenerationConfig
+    :members:
+
+
+BertGenerationTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertGenerationTokenizer
+    :members: 
+
+BertGenerationEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertGenerationEncoder
+    :members:
+
+
+BertGenerationDecoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertGenerationDecoder
+    :members:
--- a/docs/source/model_doc/camembert.rst
+++ b/docs/source/model_doc/camembert.rst
@@ -1,6 +1,9 @@
 CamemBERT
 ----------------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`__
 by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
 Clergerie, Djamé Seddah, and Benoît Sagot. It is based on Facebook's RoBERTa model released in 2019. It is a model
@@ -46,6 +49,13 @@ CamembertModel
    :members:


+CamembertForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertForCausalLM
+    :members:
+
+
 CamembertForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -74,6 +84,13 @@ CamembertForTokenClassification
    :members:


+CamembertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertForQuestionAnswering
+    :members:
+
+
 TFCamembertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -95,8 +112,22 @@ TFCamembertForSequenceClassification
    :members:


+TFCamembertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFCamembertForMultipleChoice
+    :members:
+
+
 TFCamembertForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFCamembertForTokenClassification
    :members:
+
+
+TFCamembertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFCamembertForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -1,6 +1,9 @@
 CTRL
 ----------------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_
 by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -1,6 +1,9 @@
 DistilBERT
 ----------------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The DistilBERT model was proposed in the blog post
 `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`__,
 and the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__.
@@ -72,6 +75,20 @@ DistilBertForSequenceClassification
    :members:


+DistilBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DistilBertForMultipleChoice
+    :members:
+
+
+DistilBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DistilBertForTokenClassification
+    :members:
+
+
 DistilBertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -99,6 +116,22 @@ TFDistilBertForSequenceClassification
    :members:


+
+TFDistilBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDistilBertForMultipleChoice
+    :members:
+
+
+
+TFDistilBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDistilBertForTokenClassification
+    :members:
+
+
 TFDistilBertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/dpr.rst
+++ b/docs/source/model_doc/dpr.rst
@@ -0,0 +1,102 @@
+DPR
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+Dense Passage Retrieval (DPR) is a set of tools and models for state-of-the-art open-domain Q&A research.
+It is based on the following paper:
+
+Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih, Dense Passage Retrieval for Open-Domain Question Answering.
+
+The abstract from the paper is the following:
+
+*Open-domain question answering relies on efficient passage retrieval to select candidate contexts, where traditional
+sparse vector space models, such as TF-IDF or BM25, are the de facto method. In this work, we show that retrieval can
+be practically implemented using dense representations alone, where embeddings are learned from a small number of
+questions and passages by a simple dual-encoder framework. When evaluated on a wide range of open-domain QA datasets,
+our dense retriever outperforms a strong Lucene-BM25 system largely by 9%-19% absolute in terms of top-20 passage
+retrieval accuracy, and helps our end-to-end QA system establish new state-of-the-art on multiple open-domain QA
+benchmarks.*
+
+The original code can be found `here <https://github.com/facebookresearch/DPR>`_.
+
+
+DPRConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DPRConfig
+    :members:
+
+
+DPRContextEncoderTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DPRContextEncoderTokenizer
+    :members:
+
+
+DPRContextEncoderTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DPRContextEncoderTokenizerFast
+    :members:
+
+DPRQuestionEncoderTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DPRQuestionEncoderTokenizer
+    :members:
+
+
+DPRQuestionEncoderTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DPRQuestionEncoderTokenizerFast
+    :members:
+
+DPRReaderTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DPRReaderTokenizer
+    :members:
+
+
+DPRReaderTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DPRReaderTokenizerFast
+    :members:
+
+
+DPR specific outputs
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_dpr.DPRContextEncoderOutput
+    :members:
+
+.. autoclass:: transformers.modeling_dpr.DPRQuestionEncoderOutput
+    :members:
+
+.. autoclass:: transformers.modeling_dpr.DPRReaderOutput
+    :members:
+
+
+DPRContextEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DPRContextEncoder
+    :members:
+
+DPRQuestionEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DPRQuestionEncoder
+    :members:
+
+
+DPRReader
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DPRReader
+    :members:
--- a/docs/source/model_doc/electra.rst
+++ b/docs/source/model_doc/electra.rst
@@ -1,6 +1,9 @@
 ELECTRA
 ----------------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The ELECTRA model was proposed in the paper.
 `ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators <https://openreview.net/pdf?id=r1xMH1BtvB>`__.
 ELECTRA is a new pre-training approach which trains two transformer models: the generator and the discriminator. The
@@ -68,6 +71,16 @@ ElectraTokenizerFast
    :members:


+Electra specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_electra.ElectraForPreTrainingOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_electra.TFElectraForPreTrainingOutput
+    :members:
+
+
 ElectraModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -89,6 +102,20 @@ ElectraForMaskedLM
    :members:


+ElectraForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForSequenceClassification
+    :members:
+
+
+ElectraForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForMultipleChoice
+    :members:
+
+
 ElectraForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -96,6 +123,13 @@ ElectraForTokenClassification
    :members:


+ElectraForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ElectraForQuestionAnswering
+    :members:
+
+
 TFElectraModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -117,8 +151,29 @@ TFElectraForMaskedLM
    :members:


+TFElectraForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForSequenceClassification
+    :members:
+
+
+TFElectraForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForMultipleChoice
+    :members:
+
+
 TFElectraForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFElectraForTokenClassification
    :members:
+
+
+TFElectraForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFElectraForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/encoderdecoder.rst
+++ b/docs/source/model_doc/encoderdecoder.rst
@@ -1,16 +1,17 @@
 Encoder Decoder Models
-----------
+------------------------

-This class can wrap an encoder model, such as ``BertModel`` and a decoder modeling with a language modeling head, such as ``BertForMaskedLM`` into a encoder-decoder model.
+The :class:`~transformers.EncoderDecoderModel` can be used to initialize a sequence-to-sequence model with any pre-trained autoencoding model as the encoder and any pre-trained autoregressive model as the decoder.

-The ``EncoderDecoderModel`` class allows to instantiate a encoder decoder model using the ``from_encoder_decoder_pretrain`` class method taking a pretrained encoder and pretrained decoder model as an input. 
-The ``EncoderDecoderModel`` is saved using the standard ``save_pretrained()`` method and can also again be loaded using the standard ``from_pretrained()`` method. 
+The effectiveness of initializing sequence-to-sequence models with pre-trained checkpoints for sequence generation tasks was shown in `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.

-An application of this architecture could be *summarization* using two pretrained Bert models as is shown in the paper: `Text Summarization with Pretrained Encoders <https://arxiv.org/abs/1910.13461>`_ by Yang Liu and Mirella Lapata. 
+After such an :class:`~transformers.EncoderDecoderModel` has been trained / fine-tuned, it can be saved / loaded just like any other model (see Examples for more information).
+
+An application of this architecture could be to leverage two pre-trained :obj:`transformers.BertModel` models as the encoder and decoder for a summarization model as was shown in: `Text Summarization with Pretrained Encoders <https://arxiv.org/abs/1908.08345>`_ by Yang Liu and Mirella Lapata. 


 ``EncoderDecoderConfig``
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.EncoderDecoderConfig
    :members:
--- a/docs/source/model_doc/flaubert.rst
+++ b/docs/source/model_doc/flaubert.rst
@@ -1,6 +1,9 @@
 FlauBERT
 ----------------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The FlauBERT model was proposed in the paper
 `FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le et al.
 It's a transformer pre-trained using a masked language modeling (MLM) objective (BERT-like).
@@ -58,6 +61,20 @@ FlaubertForSequenceClassification
    :members:


+FlaubertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaubertForMultipleChoice
+    :members:
+
+
+FlaubertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaubertForTokenClassification
+    :members:
+
+
 FlaubertForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -72,3 +89,43 @@ FlaubertForQuestionAnswering
    :members:


+TFFlaubertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertModel
+    :members:
+
+
+TFFlaubertWithLMHeadModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertWithLMHeadModel
+    :members:
+
+
+TFFlaubertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertForSequenceClassification
+    :members:
+
+
+TFFlaubertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertForMultipleChoice
+    :members:
+
+
+TFFlaubertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertForTokenClassification
+    :members:
+
+
+TFFlaubertForQuestionAnsweringSimple
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFlaubertForQuestionAnsweringSimple
+    :members:
--- a/docs/source/model_doc/fsmt.rst
+++ b/docs/source/model_doc/fsmt.rst
@@ -0,0 +1,49 @@
+FSMT
+----------------------------------------------------
+**DISCLAIMER:** If you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+@stas00.
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+FSMT (FairSeq MachineTranslation) models were introduced in `Facebook FAIR's WMT19 News Translation Task Submission <https://arxiv.org/abs/1907.06616>`__ by Nathan Ng, Kyra Yee, Alexei Baevski, Myle Ott, Michael Auli, Sergey Edunov.
+
+The abstract of the paper is the following:
+
+    This paper describes Facebook FAIR's submission to the WMT19 shared news translation task. We participate in two language pairs and four language directions, English <-> German and English <-> Russian. Following our submission from last year, our baseline systems are large BPE-based transformer models trained with the Fairseq sequence modeling toolkit which rely on sampled back-translations. This year we experiment with different bitext data filtering schemes, as well as with adding filtered back-translated data. We also ensemble and fine-tune our models on domain-specific data, then decode using noisy channel model reranking. Our submissions are ranked first in all four directions of the human evaluation campaign. On En->De, our system significantly outperforms other systems as well as human translations. This system improves upon our WMT'18 submission by 4.5 BLEU points.
+
+The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/wmt19>`__.
+
+Implementation Notes
+~~~~~~~~~~~~~~~~~~~~
+
+- FSMT uses a source and a target vocabulary pair that isn't combined into one. It doesn't share embed tokens either. Its tokenizer is very similar to `XLMTokenizer` and the main model is derived from `BartModel`.
+
+
+FSMTForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FSMTForConditionalGeneration
+    :members: forward
+
+
+FSMTConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FSMTConfig
+    :members:
+
+
+FSMTTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FSMTTokenizer
+    :members:
+
+
+FSMTModel
+~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FSMTModel
+    :members: forward
--- a/docs/source/model_doc/funnel.rst
+++ b/docs/source/model_doc/funnel.rst
@@ -0,0 +1,185 @@
+Funnel Transformer
+------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The Funnel Transformer model was proposed in the paper
+`Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing
+<https://arxiv.org/abs/2006.03236>`__.
+It is a bidirectional transformer model, like BERT, but with a pooling operation after each block of layers, a bit
+like in traditional convolutional neural networks (CNN) in computer vision.
+
+The abstract from the paper is the following:
+
+*With the success of language pretraining, it is highly desirable to develop more efficient architectures of good
+scalability that can exploit the abundant unlabeled data at a lower cost. To improve the efficiency, we examine the
+much-overlooked redundancy in maintaining a full-length token-level presentation, especially for tasks that only
+require a single-vector presentation of the sequence. With this intuition, we propose Funnel-Transformer which
+gradually compresses the sequence of hidden states to a shorter one and hence reduces the computation cost. More
+importantly, by re-investing the saved FLOPs from length reduction in constructing a deeper or wider model, we further
+improve the model capacity. In addition, to perform token-level predictions as required by common pretraining
+objectives, Funnel-Transformer is able to recover a deep representation for each token from the reduced hidden sequence
+via a decoder. Empirically, with comparable or fewer FLOPs, Funnel-Transformer outperforms the standard Transformer on
+a wide variety of sequence-level prediction tasks, including text classification, language understanding, and reading
+comprehension.*
+
+Tips:
+
+- Since Funnel Transformer uses pooling, the sequence length of the hidden states changes after each block of layers.
+  The base model therefore has a final sequence length that is a quarter of the original one. This model can be used
+  directly for tasks that just require a sentence summary (like sequence classification or multiple choice). For other
+  tasks, the full model is used; this full model has a decoder that upsamples the final hidden states to the same
+  sequence length as the input.
+- The Funnel Transformer checkpoints are all available with a full version and a base version. The first ones should
+  be used for :class:`~transformers.FunnelModel`, :class:`~transformers.FunnelForPreTraining`,
+  :class:`~transformers.FunnelForMaskedLM`, :class:`~transformers.FunnelForTokenClassification` and
+  :class:`~transformers.FunnelForQuestionAnswering`. The second ones should be used for
+  :class:`~transformers.FunnelBaseModel`, :class:`~transformers.FunnelForSequenceClassification` and
+  :class:`~transformers.FunnelForMultipleChoice`.
+
+The original code can be found `here <https://github.com/laiguokun/Funnel-Transformer>`_.
+
+
+FunnelConfig
+~~~~~~~~~~~~
+
+.. autoclass:: transformers.FunnelConfig
+    :members:
+
+
+FunnelTokenizer
+~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FunnelTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+FunnelTokenizerFast
+~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FunnelTokenizerFast
+    :members:
+
+
+Funnel specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_funnel.FunnelForPreTrainingOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_funnel.TFFunnelForPreTrainingOutput
+    :members:
+
+
+FunnelBaseModel
+~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FunnelBaseModel
+    :members:
+
+
+FunnelModel
+~~~~~~~~~~~
+
+.. autoclass:: transformers.FunnelModel
+    :members:
+
+
+FunnelForPreTraining
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FunnelForPreTraining
+    :members:
+
+
+FunnelForMaskedLM
+~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FunnelForMaskedLM
+    :members:
+
+
+FunnelForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FunnelForSequenceClassification
+    :members:
+
+
+FunnelForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FunnelForMultipleChoice
+    :members:
+
+
+FunnelForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FunnelForTokenClassification
+    :members:
+
+
+FunnelForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FunnelForQuestionAnswering
+    :members:
+
+
+TFFunnelBaseModel
+~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFunnelBaseModel
+    :members:
+
+
+TFFunnelModel
+~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFunnelModel
+    :members:
+
+
+TFFunnelForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFunnelForPreTraining
+    :members:
+
+
+TFFunnelForMaskedLM
+~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFunnelForMaskedLM
+    :members:
+
+
+TFFunnelForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFunnelForSequenceClassification
+    :members:
+
+
+TFFunnelForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFunnelForMultipleChoice
+    :members:
+
+
+TFFunnelForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFunnelForTokenClassification
+    :members:
+
+
+TFFunnelForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFFunnelForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -38,6 +38,17 @@ Hugging Face showcasing the generative capabilities of several models. GPT is on

 The original code can be found `here <https://github.com/openai/finetune-transformer-lm>`_.

+Note:
+
+If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install 
+``ftfy`` and ``SpaCy``::
+
+    pip install spacy ftfy==4.4.3
+    python -m spacy download en
+
+If you don't install ``ftfy`` and ``SpaCy``, the :class:`transformers.OpenAIGPTTokenizer` will default to tokenize using 
+BERT's :obj:`BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't 
+worry).

 OpenAIGPTConfig
 ~~~~~~~~~~~~~~~~~~~~~
@@ -60,6 +71,16 @@ OpenAIGPTTokenizerFast
    :members:


+OpenAI specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_openai.OpenAIGPTDoubleHeadsModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_openai.TFOpenAIGPTDoubleHeadsModelOutput
+    :members:
+
+
 OpenAIGPTModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -58,6 +58,16 @@ GPT2TokenizerFast
    :members:


+GPT2 specific outputs
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_gpt2.GPT2DoubleHeadsModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_gpt2.TFGPT2DoubleHeadsModelOutput
+    :members:
+
+
 GPT2Model
 ~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/layoutlm.rst
+++ b/docs/source/model_doc/layoutlm.rst
@@ -0,0 +1,55 @@
+LayoutLM
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The LayoutLM model was proposed in `LayoutLM: Pre-training of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__
+by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. It's a simple but effective pre-training method 
+of text and layout for document image understanding and information extraction tasks, such as form understanding and receipt understanding.
+
+The abstract from the paper is the following:
+
+*Pre-training techniques have been verified successfully in a variety of NLP tasks in recent years. Despite the widespread use of pre-training models for NLP applications, they almost exclusively focus on text-level manipulation, while neglecting layout and style information that is vital for document image understanding. In this paper, we propose the LayoutLM to jointly model interactions between text and layout information across scanned document images, which is beneficial for a great number of real-world document image understanding tasks such as information extraction from scanned documents. Furthermore, we also leverage image features to incorporate words' visual information into LayoutLM. To the best of our knowledge, this is the first time that text and layout are jointly learned in a single framework for document-level pre-training. It achieves new state-of-the-art results in several downstream tasks, including form understanding (from 70.72 to 79.27), receipt understanding (from 94.02 to 95.24) and document image classification (from 93.07 to 94.42).*
+
+Tips:
+
+- LayoutLM has an extra input called :obj:`bbox`, which is the bounding boxes of the input tokens.
+- The :obj:`bbox` input requires coordinates on a 0-1000 scale, which means you should normalize the bounding boxes before passing them into the model.
+
+The original code can be found `here <https://github.com/microsoft/unilm/tree/master/layoutlm>`_.
+
+
+LayoutLMConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMConfig
+    :members:
+
+
+LayoutLMTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMTokenizer
+    :members:
+
+
+LayoutLMModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMModel
+    :members:
+
+
+LayoutLMForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMForMaskedLM
+    :members:
+
+
+LayoutLMForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMForTokenClassification
+    :members:
--- a/docs/source/model_doc/longformer.rst
+++ b/docs/source/model_doc/longformer.rst
@@ -4,7 +4,7 @@ Longformer
 file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_

 Overview
-~~~~~
+~~~~~~~~~
 The Longformer model was presented in `Longformer: The Long-Document Transformer <https://arxiv.org/pdf/2004.05150.pdf>`_ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
 Here the abstract: 

@@ -13,10 +13,10 @@ Here the abstract:
 The Authors' code can be found `here <https://github.com/allenai/longformer>`_ .

 Longformer Self Attention
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~
 Longformer self attention employs self attention on both a "local" context and a "global" context.
 Most tokens only attend "locally" to each other meaning that each token attends to its :math:`\frac{1}{2} w` previous tokens and :math:`\frac{1}{2} w` succeding tokens with :math:`w` being the window length as defined in `config.attention_window`. Note that `config.attention_window` can be of type ``list`` to define a different :math:`w` for each layer. 
-A selecetd few tokens attend "globally" to all other tokens, as it is conventionally done for all tokens in *e.g.* `BertSelfAttention`.
+A selected few tokens attend "globally" to all other tokens, as it is conventionally done for all tokens in *e.g.* `BertSelfAttention`.

 Note that "locally" and "globally" attending tokens are projected by different query, key and value matrices.
 Also note that every "locally" attending token not only attends to tokens within its window :math:`w`, but also to all "globally" attending tokens so that global attention is *symmetric*.
@@ -55,6 +55,13 @@ LongformerTokenizer
    :members: 


+LongformerTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerTokenizerFast
+    :members: 
+
+
 LongformerModel
 ~~~~~~~~~~~~~~~~~~~~

@@ -69,10 +76,10 @@ LongformerForMaskedLM
    :members:


-LongformerForQuestionAnswering
+LongformerForSequenceClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.LongformerForQuestionAnswering
+.. autoclass:: transformers.LongformerForSequenceClassification
    :members:


@@ -89,3 +96,31 @@ LongformerForTokenClassification
 .. autoclass:: transformers.LongformerForTokenClassification
    :members:

+
+LongformerForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LongformerForQuestionAnswering
+    :members:
+
+
+TFLongformerModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLongformerModel
+    :members:
+
+
+TFLongformerForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLongformerForMaskedLM
+    :members:
+
+
+TFLongformerForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLongformerForQuestionAnswering
+    :members:
+
--- a/docs/source/model_doc/lxmert.rst
+++ b/docs/source/model_doc/lxmert.rst
@@ -0,0 +1,109 @@
+LXMERT
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The LXMERT model was proposed in `LXMERT: Learning Cross-Modality Encoder Representations from Transformers <https://arxiv.org/abs/1908.07490>`__
+by Hao Tan & Mohit Bansal. It is a series of bidirectional transformer encoders (one for the vision modality, one for the language modality, and then one to fuse both modalities)
+pre-trained using a combination of masked language modeling, visual-language text alignment, ROI-feature regression, masked visual-attribute modeling, masked visual-object modeling, and visual-question answering objectives.
+The pretraining consists of multiple multi-modal datasets: MSCOCO, Visual-Genome + Visual-Genome Question Answering, VQA 2.0, and GQA.
+
+The abstract from the paper is the following:
+
+*Vision-and-language reasoning requires an understanding of visual concepts, language semantics, and, most importantly, the alignment and relationships between these two
+modalities. We thus propose the LXMERT
+(Learning Cross-Modality Encoder Representations from Transformers) framework to learn
+these vision-and-language connections. In
+LXMERT, we build a large-scale Transformer
+model that consists of three encoders: an object relationship encoder, a language encoder,
+and a cross-modality encoder. Next, to endow our model with the capability of connecting vision and language semantics, we
+pre-train the model with large amounts of
+image-and-sentence pairs, via five diverse representative pre-training tasks: masked language modeling, masked object prediction
+(feature regression and label classification),
+cross-modality matching, and image question answering. These tasks help in learning both intra-modality and cross-modality relationships. After fine-tuning from our pretrained parameters, our model achieves the
+state-of-the-art results on two visual question answering datasets (i.e., VQA and GQA).
+We also show the generalizability of our pretrained cross-modality model by adapting it to
+a challenging visual-reasoning task, NLVR2,
+and improve the previous best result by 22%
+absolute (54% to 76%). Lastly, we demonstrate detailed ablation studies to prove that
+both our novel model components and pretraining strategies significantly contribute to
+our strong results; and also present several
+attention visualizations for the different encoders.*
+
+Tips:
+
+- Bounding boxes are not required in the visual feature embeddings; any kind of visual-spatial features will work.
+- Both the language hidden states and the visual hidden states that LXMERT outputs are passed through the cross-modality layer, so they
+  contain information from both modalities. To access a modality that only attends to itself, select the vision/language hidden states from the first input in the tuple.
+- The bi-directional cross-modality encoder attention only returns attention values when the language modality is used as the input and the vision modality is used as the context vector. Further,
+  while the cross-modality encoder contains self-attention for each respective modality and cross-attention, only the cross attention is returned and both self attention outputs are disregarded.
+
+The code can be found `here <https://github.com/airsplay/lxmert>`__
+
+
+LxmertConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LxmertConfig
+    :members:
+
+
+LxmertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LxmertTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+Lxmert specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_lxmert.LxmertModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_lxmert.LxmertForPreTrainingOutput
+    :members:
+
+.. autoclass:: transformers.modeling_lxmert.LxmertForQuestionAnsweringOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_lxmert.TFLxmertModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_lxmert.TFLxmertForPreTrainingOutput
+    :members:
+
+
+LxmertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LxmertModel
+    :members:
+
+LxmertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LxmertForPreTraining
+    :members:
+
+LxmertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LxmertForQuestionAnswering
+    :members:
+
+
+TFLxmertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLxmertModel
+    :members:
+
+TFLxmertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLxmertForPreTraining
+    :members:
--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -1,16 +1,16 @@
 MarianMT
 ----------------------------------------------------
-**DISCLAIMER:** If you see something strange,
-file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+**Bugs:** If you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title>`__ and assign
@sshleifer. Translations should be similar, but not identical to, output in the test set linked to in each model card.

 Implementation Notes
 ~~~~~~~~~~~~~~~~~~~~
- each model is about 298 MB on disk, there are 1,000+ models.
+- Each model is about 298 MB on disk, there are 1,000+ models.
 - The list of supported language pairs can be found `here <https://huggingface.co/Helsinki-NLP>`__.
- The 1,000+ models were originally trained by `Jörg Tiedemann <https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann>`__ using the `Marian <https://marian-nmt.github.io/>`_ C++ library, which supports fast training and translation.
+- Models were originally trained by `Jörg Tiedemann <https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann>`__ using the `Marian <https://marian-nmt.github.io/>`_ C++ library, which supports fast training and translation.
 - All models are transformer encoder-decoders with 6 layers in each component. Each model's performance is documented in a model card.
- the 80 opus models that require BPE preprocessing are not supported.
+- The 80 opus models that require BPE preprocessing are not supported.
 - The modeling code is the same as ``BartForConditionalGeneration`` with a few minor modifications:
    - static (sinusoid) positional embeddings (``MarianConfig.static_position_embeddings=True``)
    - a new final_logits_bias (``MarianConfig.add_bias_logits=True``)
@@ -48,7 +48,7 @@ Example of translating english to many romance languages, using language codes:
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    print(tokenizer.supported_language_codes)
    model = MarianMTModel.from_pretrained(model_name)
-    translated = model.generate(**tokenizer.prepare_translation_batch(src_text))
+    translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text))
    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
    # ["c'est une phrase en anglais que nous voulons traduire en français",
    # 'Isto deve ir para o português.',
@@ -92,9 +92,11 @@ MarianMTModel
 Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints.
 Model API is identical to BartForConditionalGeneration.
 Available models are listed at `Model List <https://huggingface.co/models?search=Helsinki-NLP>`__
-This class inherits all functionality from ``BartForConditionalGeneration``, see that page for method signatures.
+This class inherits nearly all functionality from ``BartForConditionalGeneration``, see that page for method signatures.

-.. autoclass:: transformers.MarianMTModel
+MarianConfig
+~~~~~~~~~~~~~~~~~~~
+.. autoclass:: transformers.MarianConfig
    :members:


@@ -102,4 +104,8 @@ MarianTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.MarianTokenizer
-    :members: prepare_translation_batch
+    :members: prepare_seq2seq_batch
+
+
+
+
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@@ -0,0 +1,76 @@
+MBart
+----------------------------------------------------
+**DISCLAIMER:** If you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+@sshleifer.
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+The MBart model was presented in `Multilingual Denoising Pre-training for Neural Machine Translation <https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov,
+Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. According to the abstract,
+
+MBART is a sequence-to-sequence denoising auto-encoder pre-trained on large-scale monolingual corpora in many languages using the BART objective. mBART is one of the first methods for pre-training a complete sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only on the encoder, decoder, or reconstructing parts of the text.
+
+The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/mbart>`__
+
+
+Training
+~~~~~~~~~~~~~~~~~~~~~
+MBart is a multilingual encoder-decoder (seq-to-seq) model primarily intended for translation tasks.
+As the model is multilingual, it expects the sequences in a different format. A special language id token
+is added in both the source and target text. The source text format is ``X [eos, src_lang_code]``
+where ``X`` is the source text. The target text format is ``[tgt_lang_code] X [eos]``. ``bos`` is never used.
+The ``MBartTokenizer.prepare_seq2seq_batch`` method handles this automatically and should be used to encode
+the sequences for seq-2-seq fine-tuning.
+
+- Supervised training
+
+::
+
+    example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
+    expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
+    batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian)
+    input_ids = batch["input_ids"]
+    target_ids = batch["decoder_input_ids"]
+    decoder_input_ids = target_ids[:, :-1].contiguous()
+    labels = target_ids[:, 1:].clone()
+    model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, labels=labels) #forward
+
+- Generation
+
+    While generating the target text, set the ``decoder_start_token_id`` to the target language id.
+    The following example shows how to translate English to Romanian using the ``facebook/mbart-large-en-ro`` model.
+
+::
+
+    from transformers import MBartForConditionalGeneration, MBartTokenizer
+    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
+    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
+    article = "UN Chief Says There Is No Military Solution in Syria"
+    batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX")
+    translated_tokens = model.generate(**batch, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
+    translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
+    assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria"
+
+
+MBartConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartConfig
+    :members:
+
+
+MBartTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartTokenizer
+    :members: build_inputs_with_special_tokens, prepare_seq2seq_batch
+
+
+MBartForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartForConditionalGeneration
+    :members: generate, forward
+
+
--- a/docs/source/model_doc/mobilebert.rst
+++ b/docs/source/model_doc/mobilebert.rst
@@ -0,0 +1,179 @@
+MobileBERT
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The MobileBERT model was proposed in `MobileBERT: a Compact Task-Agnostic BERT
+for Resource-Limited Devices <https://arxiv.org/abs/2004.02984>`__
+by Zhiqing Sun, Hongkun Yu, Xiaodan Song, Renjie Liu, Yiming Yang, and Denny Zhou. It's a bidirectional transformer
+based on the BERT model, which is compressed and accelerated using several approaches.
+
+The abstract from the paper is the following:
+
+*Natural Language Processing (NLP) has recently achieved great success by using huge pre-trained models with hundreds
+of millions of parameters. However, these models suffer from heavy model sizes and high latency such that they cannot
+be deployed to resource-limited mobile devices. In this paper, we propose MobileBERT for compressing and accelerating
+the popular BERT model. Like the original BERT, MobileBERT is task-agnostic, that is, it can be generically applied
+to various downstream NLP tasks via simple fine-tuning. Basically, MobileBERT is a thin version of BERT_LARGE, while
+equipped with bottleneck structures and a carefully designed balance between self-attentions and feed-forward
+networks. To train MobileBERT, we first train a specially designed teacher model, an inverted-bottleneck incorporated
+BERT_LARGE model. Then, we conduct knowledge transfer from this teacher to MobileBERT. Empirical studies show that
+MobileBERT is 4.3x smaller and 5.5x faster than BERT_BASE while achieving competitive results on well-known
+benchmarks. On the natural language inference tasks of GLUE, MobileBERT achieves a GLUE score of 77.7
+(0.6 lower than BERT_BASE), and 62 ms latency on a Pixel 4 phone. On the SQuAD v1.1/v2.0 question answering task,
+MobileBERT achieves a dev F1 score of 90.0/79.2 (1.5/2.1 higher than BERT_BASE).*
+
+Tips:
+
+- MobileBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
+  the right rather than the left.
+- MobileBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective.
+  It is therefore efficient at predicting masked tokens and at NLU in general, but is not optimal for
+  text generation. Models trained with a causal language modeling (CLM) objective are better in that regard.
+
+The original code can be found `here <https://github.com/google-research/mobilebert>`_.
+
+MobileBertConfig
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertConfig
+    :members:
+
+
+MobileBertTokenizer
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+MobileBertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertTokenizerFast
+    :members:
+
+
+MobileBert specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_mobilebert.MobileBertForPreTrainingOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_mobilebert.TFMobileBertForPreTrainingOutput
+    :members:
+
+
+MobileBertModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertModel
+    :members:
+
+
+MobileBertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForPreTraining
+    :members:
+
+
+MobileBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForMaskedLM
+    :members:
+
+
+MobileBertForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForNextSentencePrediction
+    :members:
+
+
+MobileBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForSequenceClassification
+    :members:
+
+
+MobileBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForMultipleChoice
+    :members:
+
+
+MobileBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForTokenClassification
+    :members:
+
+
+MobileBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MobileBertForQuestionAnswering
+    :members:
+
+
+TFMobileBertModel
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertModel
+    :members:
+
+
+TFMobileBertForPreTraining
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForPreTraining
+    :members:
+
+
+TFMobileBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForMaskedLM
+    :members:
+
+
+TFMobileBertForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForNextSentencePrediction
+    :members:
+
+
+TFMobileBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForSequenceClassification
+    :members:
+
+
+TFMobileBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForMultipleChoice
+    :members:
+
+
+TFMobileBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForTokenClassification
+    :members:
+
+
+TFMobileBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMobileBertForQuestionAnswering
+    :members:
+
--- a/docs/source/model_doc/pegasus.rst
+++ b/docs/source/model_doc/pegasus.rst
@@ -0,0 +1,117 @@
+Pegasus
+----------------------------------------------------
+**DISCLAIMER:** If you see something strange,
+file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title>`__ and assign
+@sshleifer.
+
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The Pegasus model was proposed in `PEGASUS: Pre-training with Extracted Gap-sentences for
+Abstractive Summarization <https://arxiv.org/pdf/1912.08777.pdf>`_ by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.
+According to the abstract,
+
+- Pegasus' pretraining task is intentionally similar to summarization: important sentences are removed/masked from an input document and are generated together as one output sequence from the remaining sentences, similar to an extractive summary.
+- Pegasus achieves SOTA summarization performance on all 12 downstream tasks, as measured by ROUGE and human eval.
+
+The Authors' code can be found `here <https://github.com/google-research/pegasus>`_.
+
+
+Checkpoints
+~~~~~~~~~~~
+All the `checkpoints <https://huggingface.co/models?search=pegasus>`_ are finetuned for summarization, besides ``pegasus-large``, whence the other checkpoints are finetuned.
+- Each checkpoint is 2.2 GB on disk and 568M parameters.
+- FP16 is not supported (help/ideas on this appreciated!).
+- Summarizing xsum in fp32 takes about 400ms/sample, with default parameters on a v100 GPU.
+- For XSUM, the paper reports rouge1/rouge2/rougeL of 47.21/24.56/39.25. As of Aug 9, this port scores 46.91/24.34/39.1.
+The gap is likely because of different alpha/length_penalty implementations in beam search.
+
+
+Implementation Notes
+~~~~~~~~~~~~~~~~~~~~
+
+- All models are transformer encoder-decoders with 16 layers in each component.
+- The implementation is completely inherited from ``BartForConditionalGeneration``
+- Some key configuration differences:
+    - static, sinusoidal position embeddings
+    - no ``layernorm_embedding`` (``PegasusConfig.normalize_embedding=False``)
+    - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix.
+    - ``num_beams=8``
+- All pretrained pegasus checkpoints are the same besides three attributes: ``tokenizer.model_max_length`` (max input size),  ``max_length`` (max num tokens to generate) and ``length_penalty``
+- Code to convert checkpoints trained in the author's `repo <https://github.com/google-research/pegasus>`_ can be found in ``convert_pegasus_tf_to_pytorch.py``
+
+
+Usage Example
+~~~~~~~~~~~~~~~~~~~~
+
+.. code-block:: python
+
+    from transformers import PegasusForConditionalGeneration, PegasusTokenizer
+    import torch
+    src_text = [
+        """ PG&E stated it scheduled the blackouts in response to forecasts for high winds amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."""
+    ]
+
+    model_name = 'google/pegasus-xsum'
+    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
+    tokenizer = PegasusTokenizer.from_pretrained(model_name)
+    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
+    batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest').to(torch_device)
+    translated = model.generate(**batch)
+    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
+    assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."
+
+PegasusForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This class inherits all functionality from ``BartForConditionalGeneration``, see that page for method signatures.
+Available models are listed at `Model List <https://huggingface.co/models?search=pegasus>`__
+
+.. autoclass:: transformers.PegasusForConditionalGeneration
+    :members:
+
+
+PegasusConfig
+~~~~~~~~~~~~~~~~~~~
+This config fully inherits from ``BartConfig``, but pegasus uses different default values:
+Up to date parameter values can be seen in `S3 <https://s3.amazonaws.com/models.huggingface.co/bert/google/pegasus-xsum/config.json>`_.
+As of Aug 10, 2020, they are:
+
+.. code-block:: python
+
+    dict(
+    vocab_size=96103,
+    max_position_embeddings=512,
+    d_model=1024,
+    encoder_ffn_dim=4096,
+    decoder_ffn_dim=4096,
+    encoder_attention_heads=16,
+    decoder_attention_heads=16,
+    encoder_layers=16,
+    decoder_layers=16,
+    dropout=0.1,
+    attention_dropout=0.1,
+    activation_dropout=0.1,
+    pad_token_id=0,
+    eos_token_id=1,
+    is_encoder_decoder=True,
+    normalize_before=True,
+    scale_embedding=True,
+    normalize_embedding=False,
+    add_final_layer_norm=True,
+    static_position_embeddings=True,
+    num_beams=8,
+    activation_function="relu",
+    )
+
+
+PegasusTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+warning: ``add_tokens`` does not work at the moment.
+
+.. autoclass:: transformers.PegasusTokenizer
+    :members: __call__, prepare_seq2seq_batch
+
+
+
--- a/docs/source/model_doc/reformer.rst
+++ b/docs/source/model_doc/reformer.rst
@@ -4,7 +4,7 @@ Reformer
 file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_

 Overview
-~~~~~
+~~~~~~~~~~
 The Reformer model was presented in `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451.pdf>`_ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
 Here the abstract: 

@@ -13,7 +13,7 @@ Here the abstract:
 The Authors' code can be found `here <https://github.com/google/trax/tree/master/trax/models/reformer>`_ .

 Axial Positional Encodings
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 Axial Positional Encodings were first implemented in Google's `trax library <https://github.com/google/trax/blob/4d99ad4965bab1deba227539758d59f0df0fef48/trax/layers/research/position_encodings.py#L29>`_ and developed by the authors of this model's paper. In models that are treating very long input sequences, the conventional position id encodings store an embedings vector of size :math:`d` being the ``config.hidden_size`` for every position :math:`i, \ldots, n_s`, with :math:`n_s` being ``config.max_embedding_size``. *E.g.*, having a sequence length of :math:`n_s = 2^{19} \approx 0.5M` and a ``config.hidden_size`` of :math:`d = 2^{10} \approx 1000` would result in a position encoding matrix:

 .. math::
@@ -112,3 +112,24 @@ ReformerModelWithLMHead

 .. autoclass:: transformers.ReformerModelWithLMHead
    :members:
+
+
+ReformerForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerForMaskedLM
+    :members:
+
+
+ReformerForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerForSequenceClassification
+    :members:
+
+
+ReformerForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/retribert.rst
+++ b/docs/source/model_doc/retribert.rst
@@ -0,0 +1,39 @@
+RetriBERT
+----------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
+The RetriBERT model was proposed in the blog post
+`Explain Anything Like I'm Five: A Model for Open Domain Long Form Question Answering <https://yjernite.github.io/lfqa.html>`__.
+RetriBERT is a small model that uses either a single or pair of Bert encoders with lower-dimension projection for dense semantic indexing of text.
+
+Code to train and use the model can be found `here <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
+
+
+RetriBertConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RetriBertConfig
+    :members:
+
+
+RetriBertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RetriBertTokenizer
+    :members:
+
+
+RetriBertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RetriBertTokenizerFast
+    :members:
+
+
+RetriBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RetriBertModel
+    :members:
--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -1,6 +1,9 @@
 RoBERTa
 ----------------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_
 by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer,
 Veselin Stoyanov. It is based on Google's BERT model released in 2018.
@@ -60,6 +63,13 @@ RobertaModel
    :members:


+RobertaForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RobertaForCausalLM
+    :members:
+
+
 RobertaForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -87,6 +97,14 @@ RobertaForTokenClassification
 .. autoclass:: transformers.RobertaForTokenClassification
    :members:

+
+RobertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.RobertaForQuestionAnswering
+    :members:
+
+
 TFRobertaModel
 ~~~~~~~~~~~~~~~~~~~~

@@ -108,8 +126,22 @@ TFRobertaForSequenceClassification
    :members:


+TFRobertaForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRobertaForMultipleChoice
+    :members:
+
+
 TFRobertaForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFRobertaForTokenClassification
    :members:
+
+
+TFRobertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFRobertaForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -4,7 +4,8 @@ T5
 file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`_

 Overview
-~~~~~
+~~~~~~~~~~~~~~~~~~~~~
+
 The T5 model was presented in `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer <https://arxiv.org/pdf/1910.10683.pdf>`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu in 
 Here the abstract: 

@@ -14,28 +15,38 @@ Our systematic study compares pre-training objectives, architectures, unlabeled
 By combining the insights from our exploration with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. 
 To facilitate future work on transfer learning for NLP, we release our dataset, pre-trained models, and code.*

-The Authors' code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`_ .
+Tips:
+
+- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised 
+  and supervised tasks and for which each task is converted into a text-to-text format.
+  T5 works well on a variety of tasks out-of-the-box by prepending a different prefix to the input corresponding to each task, e.g.: for translation: *translate English to German: ..., summarize: ...*.
+  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`_ .
+- For sequence to sequence generation, it is recommended to use ``T5ForConditionalGeneration.generate()``. The method takes care of feeding the encoded input via cross-attention layers to the decoder and auto-regressively generates the decoder output.
+- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
+
+The original code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`_.

 Training
-~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~
+
 T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher forcing.
 This means that for training we always need an input sequence and a target sequence. 
-The input sequence is fed to the model using ``input_ids``. The target sequence is shifted to the right, *i.e.* prepended by a start-sequence token and fed to the decoder using the `decoder_input_ids`. In teacher-forcing style, the target sequence is then appended by the EOS token and corresponds to the ``lm_labels``. The PAD token is hereby used as the start-sequence token.
+The input sequence is fed to the model using ``input_ids``. The target sequence is shifted to the right, *i.e.* prepended by a start-sequence token and fed to the decoder using the ``decoder_input_ids``. In teacher-forcing style, the target sequence is then appended by the EOS token and corresponds to the ``labels``. The PAD token is hereby used as the start-sequence token.
 T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.

 - Unsupervised denoising training

  In this setup spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) 
  and the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. 
-  Each sentinel token represents a unique mask token for this sentence and should start with ``<extra_id_1>``, ``<extra_id_2>``, ... up to ``<extra_id_100>``. As a default 100 sentinel tokens are available in ``T5Tokenizer``.
+  Each sentinel token represents a unique mask token for this sentence and should start with ``<extra_id_0>``, ``<extra_id_1>``, ... up to ``<extra_id_99>``. As a default 100 sentinel tokens are available in ``T5Tokenizer``.
  *E.g.* the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be processed as follows: 

 ::

-  input_ids = tokenizer.encode('The <extra_id_1> walks in <extra_id_2> park', return_tensors='pt')
-  lm_labels = tokenizer.encode('<extra_id_1> cute dog <extra_id_2> the <extra_id_3> </s>', return_tensors='pt')
+  input_ids = tokenizer.encode('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt')
+  labels = tokenizer.encode('<extra_id_0> cute dog <extra_id_1> the <extra_id_2> </s>', return_tensors='pt')
  # the forward function automatically creates the correct decoder_input_ids
-  model(input_ids=input_ids, lm_labels=lm_labels)
+  model(input_ids=input_ids, labels=labels)

 - Supervised training

@@ -46,20 +57,9 @@ T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.
 ::

  input_ids = tokenizer.encode('translate English to German: The house is wonderful. </s>', return_tensors='pt')
-  lm_labels = tokenizer.encode('Das Haus ist wunderbar. </s>', return_tensors='pt')
+  labels = tokenizer.encode('Das Haus ist wunderbar. </s>', return_tensors='pt')
  # the forward function automatically creates the correct decoder_input_ids
-  model(input_ids=input_ids, lm_labels=lm_labels)
-
-Tips
-~~~~~~~~~~~~~~~~~~~~
- T5 is an encoder-decoder model pre-trained on a multi-task mixture of unsupervised 
-  and supervised tasks and for which each task is converted into a text-to-text format.
-  T5 works well on a variety of tasks out-of-the-box by prepending a different prefix to the input corresponding to each task, e.g.: for translation: *translate English to German: ..., summarize: ...*.
-  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper <https://arxiv.org/pdf/1910.10683.pdf>`_ .
- For sequence to sequence generation, it is recommended to use ``T5ForConditionalGeneration.generate()``. The method takes care of feeding the encoded input via cross-attention layers to the decoder and auto-regressively generates the decoder output.
- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
-
-The original code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`_.
+  model(input_ids=input_ids, labels=labels)


 T5Config
@@ -99,7 +99,7 @@ TFT5Model


 TFT5ForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFT5ForConditionalGeneration
    :members:
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -54,6 +54,22 @@ TransfoXLTokenizerFast
    :members:


+TransfoXL specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_transfo_xl.TransfoXLModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_transfo_xl.TransfoXLLMHeadModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_transfo_xl.TFTransfoXLModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput
+    :members:
+
+
 TransfoXLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -46,6 +46,14 @@ XLMTokenizer
    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
        create_token_type_ids_from_sequences, save_vocabulary

+
+XLM specific outputs
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_xlm.XLMForQuestionAnsweringOutput
+    :members:
+
+
 XLMModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -67,6 +75,20 @@ XLMForSequenceClassification
    :members:


+XLMForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMForMultipleChoice
+    :members:
+
+
+XLMForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMForTokenClassification
+    :members:
+
+
 XLMForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -102,6 +124,21 @@ TFXLMForSequenceClassification
    :members:


+TFXLMForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMForMultipleChoice
+    :members:
+
+
+TFXLMForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMForTokenClassification
+    :members:
+
+
+
 TFXLMForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/xlmroberta.rst
+++ b/docs/source/model_doc/xlmroberta.rst
@@ -1,6 +1,9 @@
 XLM-RoBERTa
 ------------------------------------------

+Overview
+~~~~~~~~~~~~~~~~~~~~~
+
 The XLM-RoBERTa model was proposed in `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__
 by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán,
 Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019.
@@ -53,6 +56,13 @@ XLMRobertaModel
    :members:


+XLMRobertaForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaForCausalLM
+    :members:
+
+
 XLMRobertaForMaskedLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -81,6 +91,13 @@ XLMRobertaForTokenClassification
    :members:


+XLMRobertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaForQuestionAnswering
+    :members:
+
+
 TFXLMRobertaModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -102,8 +119,22 @@ TFXLMRobertaForSequenceClassification
    :members:


+TFXLMRobertaForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMRobertaForMultipleChoice
+    :members:
+
+
 TFXLMRobertaForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFXLMRobertaForTokenClassification
    :members:
+
+
+TFXLMRobertaForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLMRobertaForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -50,6 +50,49 @@ XLNetTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+XLNet specific outputs
+~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_xlnet.XLNetModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_xlnet.XLNetLMHeadModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_xlnet.XLNetForSequenceClassificationOutput
+    :members:
+
+.. autoclass:: transformers.modeling_xlnet.XLNetForMultipleChoiceOutput
+    :members:
+
+.. autoclass:: transformers.modeling_xlnet.XLNetForTokenClassificationOutput
+    :members:
+
+.. autoclass:: transformers.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput
+    :members:
+
+.. autoclass:: transformers.modeling_xlnet.XLNetForQuestionAnsweringOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetLMHeadModelOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput
+    :members:
+
+.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput
+    :members:
+
+
 XLNetModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -71,13 +114,6 @@ XLNetForSequenceClassification
    :members:


-XLNetForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.XLNetForTokenClassification
-    :members:
-
-
 XLNetForMultipleChoice
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -85,6 +121,13 @@ XLNetForMultipleChoice
    :members:


+XLNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLNetForTokenClassification
+    :members:
+
+
 XLNetForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -120,6 +163,20 @@ TFXLNetForSequenceClassification
    :members:


+TFXLNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLNetForMultipleChoice
+    :members:
+
+
+TFXLNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFXLNetForTokenClassification
+    :members:
+
+
 TFXLNetForQuestionAnsweringSimple
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_sharing.md
+++ b/docs/source/model_sharing.md
@@ -1,55 +0,0 @@
-# Model upload and sharing
-
-Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built-in to the library.
-
-**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Optionally, join an existing organization or create a new one. Then:
-
-```shell
-transformers-cli login
-# log in using the same credentials as on huggingface.co
-```
-Upload your model:
-```shell
-transformers-cli upload ./path/to/pretrained_model/
-
-# ^^ Upload folder containing weights/tokenizer/config
-# saved via `.save_pretrained()`
-
-transformers-cli upload ./config.json [--filename folder/foobar.json]
-
-# ^^ Upload a single file
-# (you can optionally override its filename, which can be nested inside a folder)
-```
-
-If you want your model to be namespaced by your organization name rather than your username, add the following flag to any command:
-```shell
--organization organization_name
-```
-
-Your model will then be accessible through its identifier, a concatenation of your username (or organization name) and the folder name above:
-```python
-"username/pretrained_model"
-# or if an org:
-"organization_name/pretrained_model"
-```
-
-**Please add a README.md model card** to the repo under `model_cards/` with: model description, training params (dataset, preprocessing, hardware used, hyperparameters), evaluation results, intended uses & limitations, etc.
-
-Your model now has a page on huggingface.co/models 🔥
-
-Anyone can load it from code:
-```python
-tokenizer = AutoTokenizer.from_pretrained("namespace/pretrained_model")
-model = AutoModel.from_pretrained("namespace/pretrained_model")
-```
-
-List all your files on S3:
-```shell
-transformers-cli s3 ls
-```
-
-You can also delete unneeded files:
-
-```shell
-transformers-cli s3 rm …
-```
--- a/docs/source/model_sharing.rst
+++ b/docs/source/model_sharing.rst
@@ -0,0 +1,224 @@
+Model sharing and uploading
+===========================
+
+In this page, we will show you how to share a model you have trained or fine-tuned on new data with the community on
+the `model hub <https://huggingface.co/models>`__.
+
+.. note::
+
+    You will need to create an account on `huggingface.co <https://huggingface.co/join>`__ for this.
+
+    Optionally, you can join an existing organization or create a new one.
+
+Prepare your model for uploading
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We have seen in the :doc:`training tutorial <training>` how to fine-tune a model on a given task. You have probably
+done something similar on your task, either using the model directly in your own training loop or using the
+:class:`~.transformers.Trainer`/:class:`~.transformers.TFTrainer` class. Let's see how you can share the result on
+the `model hub <https://huggingface.co/models>`__.
+
+Basic steps
+^^^^^^^^^^^
+
+.. 
+    When #5258 is merged, we can remove the need to create the directory.
+
+First, pick a directory with the name you want your model to have on the model hub (its full name will then be
+`username/awesome-name-you-picked` or `organization/awesome-name-you-picked`) and create it with either
+
+::
+
+    mkdir path/to/awesome-name-you-picked
+
+or in python
+
+::
+
+    import os
+    os.makedirs("path/to/awesome-name-you-picked")
+
+then you can save your model and tokenizer with:
+
+::
+
+    model.save_pretrained("path/to/awesome-name-you-picked")
+    tokenizer.save_pretrained("path/to/awesome-name-you-picked")
+
+Or, if you're using the Trainer API
+
+::
+
+    trainer.save_model("path/to/awesome-name-you-picked")
+    tokenizer.save_pretrained("path/to/awesome-name-you-picked")
+
+Make your model work on all frameworks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. 
+    TODO Sylvain: make this automatic during the upload
+
+You probably have your favorite framework, but so will other users! That's why it's best to upload your model with both
+PyTorch `and` TensorFlow checkpoints to make it easier to use (if you skip this step, users will still be able to load
+your model in another framework, but it will be slower, as it will have to be converted on the fly). Don't worry, it's super easy to do (and in a future version,
+it will all be automatic). You will need to install both PyTorch and TensorFlow for this step, but you don't need to
+worry about the GPU, so it should be very easy. Check the
+`TensorFlow installation page <https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available>`__ 
+and/or the `PyTorch installation page <https://pytorch.org/get-started/locally/#start-locally>`__ to see how.
+
+First check that your model class exists in the other framework, that is try to import the same model by either adding
+or removing TF. For instance, if you trained a :class:`~transformers.DistilBertForSequenceClassification`, try to
+type
+
+::
+
+    from transformers import TFDistilBertForSequenceClassification
+
+and if you trained a :class:`~transformers.TFDistilBertForSequenceClassification`, try to
+type
+
+::
+
+    from transformers import DistilBertForSequenceClassification
+
+This will give back an error if your model does not exist in the other framework (something that should be pretty rare
+since we're aiming for full parity between the two frameworks). In this case, skip this and go to the next step.
+
+Now, if you trained your model in PyTorch and have to create a TensorFlow version, adapt the following code to your
+model class:
+
+::
+
+    tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True)
+    tf_model.save_pretrained("path/to/awesome-name-you-picked")
+
+and if you trained your model in TensorFlow and have to create a PyTorch version, adapt the following code to your
+model class:
+
+::
+
+    pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True)
+    pt_model.save_pretrained("path/to/awesome-name-you-picked")
+
+That's all there is to it!
+
+Check the directory before uploading
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Make sure there are no garbage files in the directory you'll upload. It should only have:
+
+- a `config.json` file, which saves the :doc:`configuration <main_classes/configuration>` of your model ;
+- a `pytorch_model.bin` file, which is the PyTorch checkpoint (unless you can't have it for some reason) ;
+- a `tf_model.h5` file, which is the TensorFlow checkpoint (unless you can't have it for some reason) ;
+- a `special_tokens_map.json`, which is part of your :doc:`tokenizer <main_classes/tokenizer>` save;
+- a `tokenizer_config.json`, which is part of your :doc:`tokenizer <main_classes/tokenizer>` save;
+- a `vocab.txt`, which is the vocabulary of your tokenizer, part of your :doc:`tokenizer <main_classes/tokenizer>`
+  save;
+- maybe an `added_tokens.json`, which is part of your :doc:`tokenizer <main_classes/tokenizer>` save.
+
+Other files can safely be deleted.
+
+Upload your model with the CLI
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Now go in a terminal and run the following command. It should be in the virtual environment where you installed 🤗
+Transformers, since that command :obj:`transformers-cli` comes from the library.
+
+::
+
+    transformers-cli login
+
+Then log in using the same credentials as on huggingface.co. To upload your model, just type
+
+::
+
+    transformers-cli upload path/to/awesome-name-you-picked/
+
+This will upload the folder containing the weights, tokenizer and configuration we prepared in the previous section.
+
+By default you will be prompted to confirm that you want these files to be uploaded. If you are uploading multiple models and need to script that process, you can add `-y` to bypass the prompt. For example:
+
+::
+
+    transformers-cli upload -y path/to/awesome-name-you-picked/
+
+
+If you want to upload a single file (a new version of your model, or the other framework checkpoint you want to add),
+just type:
+
+::
+
+    transformers-cli upload path/to/awesome-name-you-picked/that-file 
+
+or
+
+::
+
+   transformers-cli upload path/to/awesome-name-you-picked/that-file --filename awesome-name-you-picked/new_name
+
+if you want to change its filename.
+
+This uploads the model to your personal account. If you want your model to be namespaced by your organization name
+rather than your username, add the following flag to any command:
+
+::
+
+    --organization organization_name
+
+so for instance:
+
+::
+
+    transformers-cli upload path/to/awesome-name-you-picked/ --organization organization_name
+
+Your model will then be accessible through its identifier, which is, as we saw above,
+`username/awesome-name-you-picked` or `organization/awesome-name-you-picked`.
+
+Add a model card
+^^^^^^^^^^^^^^^^
+
+To make sure everyone knows what your model can do, and what its limitations, potential bias or ethical
+considerations are, please add a README.md model card to the 🤗 Transformers repo under `model_cards/`. It should then be
+placed in a subfolder with your username or organization, then another subfolder named like your model
+(`awesome-name-you-picked`). Or just click on the "Create a model card on GitHub" button on the model page, it will
+get you directly to the right location. If you need one, `here <https://github.com/huggingface/model_card>`__ is a
+model card template (meta-suggestions are welcome).
+
+If your model is fine-tuned from another model coming from the model hub (all 🤗 Transformers pretrained models do),
+don't forget to link to its model card so that people can fully trace how your model was built.
+
+If you have never made a pull request to the 🤗 Transformers repo, look at the
+:doc:`contributing guide <contributing>` to see the steps to follow.
+
+.. Note::
+
+    You can also send your model card in the folder you uploaded with the CLI by placing it in a `README.md` file
+    inside `path/to/awesome-name-you-picked/`.
+
+Using your model
+^^^^^^^^^^^^^^^^
+
+Your model now has a page on huggingface.co/models 🔥
+
+Anyone can load it from code:
+
+::
+
+    tokenizer = AutoTokenizer.from_pretrained("namespace/awesome-name-you-picked")
+    model = AutoModel.from_pretrained("namespace/awesome-name-you-picked")
+
+Additional commands
+^^^^^^^^^^^^^^^^^^^
+
+You can list all the files you uploaded on the hub like this:
+
+::
+
+    transformers-cli s3 ls
+
+You can also delete unneeded files with
+
+::
+
+    transformers-cli s3 rm awesome-name-you-picked/filename
+
--- a/docs/source/model_summary.rst
+++ b/docs/source/model_summary.rst
@@ -0,0 +1,729 @@
+Summary of the models
+================================================
+
+This is a summary of the models available in 🤗 Transformers. It assumes you’re familiar with the original
+`transformer model <https://arxiv.org/abs/1706.03762>`_. For a gentle introduction check the `annotated transformer
+<http://nlp.seas.harvard.edu/2018/04/03/attention.html>`_. Here we focus on the high-level differences between the
+models. You can check them more in detail in their respective documentation. Also check out the
+:doc:`pretrained model page </pretrained_models>` to see the checkpoints available for each type of model and all `the
+community models <https://huggingface.co/models>`_.
+
+Each one of the models in the library falls into one of the following categories:
+
+  * :ref:`autoregressive-models`
+  * :ref:`autoencoding-models`
+  * :ref:`seq-to-seq-models`
+  * :ref:`multimodal-models`
+  * :ref:`retrieval-based-models`
+
+Autoregressive models are pretrained on the classic language modeling task: guess the next token having read all the
+previous ones. They correspond to the decoder of the original transformer model, and a mask is used on top of the full
+sentence so that the attention heads can only see what was before in the text, and not what’s after. Although those
+models can be fine-tuned and achieve great results on many tasks, the most natural application is text generation.
+A typical example of such models is GPT.
+
+Autoencoding models are pretrained by corrupting the input tokens in some way and trying to reconstruct the original
+sentence. They correspond to the encoder of the original transformer model in the sense that they get access to the
+full inputs without any mask. Those models usually build a bidirectional representation of the whole sentence. They can
+be fine-tuned and achieve great results on many tasks such as text generation, but their most natural application is
+sentence classification or token classification. A typical example of such models is BERT.
+
+Note that the only difference between autoregressive models and autoencoding models is in the way the model is
+pretrained. Therefore, the same architecture can be used for both autoregressive and autoencoding models. When a given
+model has been used for both types of pretraining, we have put it in the category corresponding to the article where it was first
+introduced.
+
+Sequence-to-sequence models use both the encoder and the decoder of the original transformer, either for translation
+tasks or by transforming other tasks to sequence-to-sequence problems. They can be fine-tuned to many tasks but their
+most natural applications are translation, summarization and question answering. The original transformer model is an
+example of such a model (only for translation), T5 is an example that can be fine-tuned on other tasks.
+
+Multimodal models mix text inputs with other kinds (e.g. images) and are more specific to a given task.
+
+.. _autoregressive-models:
+
+Autoregressive models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+As mentioned before, these models rely on the decoder part of the original transformer and use an attention mask so
+that at each position, the attention heads can only look at the preceding tokens.
+
+Original GPT
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=openai-gpt">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-openai--gpt-blueviolet">
+   </a>
+   <a href="model_doc/gpt.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-openai--gpt-blueviolet">
+   </a>
+
+`Improving Language Understanding by Generative Pre-Training <https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf>`_,
+Alec Radford et al.
+
+The first autoregressive model based on the transformer architecture, pretrained on the Book Corpus dataset.
+
+The library provides versions of the model for language modeling and multitask language modeling/multiple choice
+classification.
+
+GPT-2
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=gpt2">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-gpt2-blueviolet">
+   </a>
+   <a href="model_doc/gpt2.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-gpt2-blueviolet">
+   </a>
+
+`Language Models are Unsupervised Multitask Learners <https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_,
+Alec Radford et al.
+
+A bigger and better version of GPT, pretrained on WebText (web pages from outgoing links in Reddit with 3 karmas or
+more).
+
+The library provides versions of the model for language modeling and multitask language modeling/multiple choice
+classification.
+
+CTRL
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=ctrl">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-ctrl-blueviolet">
+   </a>
+   <a href="model_doc/ctrl.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-ctrl-blueviolet">
+   </a>
+
+`CTRL: A Conditional Transformer Language Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`_,
+Nitish Shirish Keskar et al.
+
+Same as the GPT model but adds the idea of control codes. Text is generated from a prompt (can be empty) and one (or
+several) of those control codes which are then used to influence the text generation: generate with the style of
+wikipedia article, a book or a movie review.
+
+The library provides a version of the model for language modeling only.
+
+Transformer-XL
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=transfo-xl">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-transfo--xl-blueviolet">
+   </a>
+   <a href="model_doc/transformerxl.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-transfo--xl-blueviolet">
+   </a>
+
+`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_,
+Zihang Dai et al.
+
+Same as a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to a regular
+RNN with two consecutive inputs). In this context, a segment is a number of consecutive tokens (for instance 512) that
+may span across multiple documents, and segments are fed in order to the model.
+
+Basically, the hidden states of the previous segment are concatenated to the current input to compute the attention
+scores. This allows the model to pay attention to information that was in the previous segment as well as the current
+one. By stacking multiple attention layers, the receptive field can be increased to multiple previous segments.
+
+This changes the positional embeddings to positional relative embeddings (as the regular positional embeddings would
+give the same results in the current input and the current hidden state at a given position) and needs to make some
+adjustments in the way attention scores are computed.
+
+The library provides a version of the model for language modeling only.
+
+.. _reformer:
+
+Reformer
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=reformer">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-reformer-blueviolet">
+   </a>
+   <a href="model_doc/reformer.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-reformer-blueviolet">
+   </a>
+
+`Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_,
+Nikita Kitaev et al.
+
+An autoregressive transformer model with lots of tricks to reduce memory footprint and compute time. Those tricks
+include:
+
+  * Use :ref:`Axial position encoding <axial-pos-encoding>` (see below for more details). It’s a mechanism to avoid
+    having a huge positional encoding matrix (when the sequence length is very big) by factorizing it into smaller
+    matrices.
+  * Replace traditional attention by :ref:`LSH (locality-sensitive hashing) attention <lsh-attention>` (see below for more
+    details). It's a technique to avoid computing the full product query-key in the attention layers.
+  * Avoid storing the intermediate results of each layer by using reversible transformer layers to obtain them during
+    the backward pass (subtracting the residuals from the input of the next layer gives them back) or recomputing them
+    for results inside a given layer (less efficient than storing them but saves memory).
+  * Compute the feedforward operations by chunks and not on the whole batch.
+
+With those tricks, the model can be fed much larger sentences than traditional transformer autoregressive models.
+
+**Note:** This model could very well be used in an autoencoding setting, there is no checkpoint for such a
+pretraining yet, though.
+
+The library provides a version of the model for language modeling only.
+
+XLNet
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=xlnet">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-xlnet-blueviolet">
+   </a>
+   <a href="model_doc/xlnet.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlnet-blueviolet">
+   </a>
+
+`XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_,
+Zhilin Yang et al.
+
+XLNet is not a traditional autoregressive model but uses a training strategy that builds on that. It permutes the
+tokens in the sentence, then allows the model to use the last n tokens to predict the token n+1. Since this is all done
+with a mask, the sentence is actually fed in the model in the right order, but instead of masking the first n tokens
+for n+1, XLNet uses a mask that hides the previous tokens in some given permutation of 1,...,sequence length.
+
+XLNet also uses the same recurrence mechanism as Transformer-XL to build long-term dependencies.
+
+The library provides a version of the model for language modeling, token classification, sentence classification,
+multiple choice classification and question answering.
+
+.. _autoencoding-models:
+
+Autoencoding models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+As mentioned before, these models rely on the encoder part of the original transformer and use no mask so the model can
+look at all the tokens in the attention heads. For pretraining, targets are the original sentences and inputs are their corrupted versions.
+
+BERT
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=bert">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-bert-blueviolet">
+   </a>
+   <a href="model_doc/bert.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-bert-blueviolet">
+   </a>
+
+`BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_,
+Jacob Devlin et al.
+
+Corrupts the inputs by using random masking, more precisely, during pretraining, a given percentage of tokens (usually
+15%) is masked by:
+
+  * a special mask token with probability 0.8
+  * a random token different from the one masked with probability 0.1
+  * the same token with probability 0.1
+
+The model must predict the original sentence, but has a second objective: inputs are two sentences A and B (with a
+separation token in between). With probability 50%, the sentences are consecutive in the corpus, in the remaining 50%
+they are not related. The model has to predict if the sentences are consecutive or not.
+
+The library provides a version of the model for language modeling (traditional or masked), next sentence prediction,
+token classification, sentence classification, multiple choice classification and question answering.
+
+ALBERT
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=albert">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-albert-blueviolet">
+   </a>
+   <a href="model_doc/albert.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-albert-blueviolet">
+   </a>
+
+`ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_,
+Zhenzhong Lan et al.
+
+Same as BERT but with a few tweaks:
+
+  * Embedding size E is different from hidden size H justified because the embeddings are context independent (one
+    embedding vector represents one token), whereas hidden states are context dependent (one hidden state represents a
+    sequence of tokens) so it's more logical to have H >> E. Also, the embedding matrix is large since it's V x E (V
+    being the vocab size). If E < H, it has fewer parameters.
+  * Layers are split in groups that share parameters (to save memory).
+  * Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and B
+    (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have
+    been swapped or not.
+
+The library provides a version of the model for masked language modeling, token classification, sentence
+classification, multiple choice classification and question answering.
+
+RoBERTa
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=roberta">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-roberta-blueviolet">
+   </a>
+   <a href="model_doc/roberta.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-roberta-blueviolet">
+   </a>
+
+`RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_,
+Yinhan Liu et al.
+
+Same as BERT with better pretraining tricks:
+
+  * dynamic masking: tokens are masked differently at each epoch, whereas BERT does it once and for all
+  * no NSP (next sentence prediction) loss and instead of putting just two sentences together, put a chunk of
+    contiguous texts together to reach 512 tokens (so the sentences are in an order that may span several documents)
+  * train with larger batches
+  * use BPE with bytes as a subunit and not characters (because of unicode characters)
+
+The library provides a version of the model for masked language modeling, token classification, sentence
+classification, multiple choice classification and question answering.
+
+DistilBERT
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=distilbert">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-distilbert-blueviolet">
+   </a>
+   <a href="model_doc/distilbert.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-distilbert-blueviolet">
+   </a>
+
+`DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_,
+Victor Sanh et al.
+
+Same as BERT but smaller. Trained by distillation of the pretrained BERT model, meaning it's been trained to predict
+the same probabilities as the larger model. The actual objective is a combination of:
+
+  * finding the same probabilities as the teacher model
+  * predicting the masked tokens correctly (but no next-sentence objective)
+  * a cosine similarity between the hidden states of the student and the teacher model
+
+The library provides a version of the model for masked language modeling, token classification, sentence classification
+and question answering.
+
+XLM
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=xlm">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-xlm-blueviolet">
+   </a>
+   <a href="model_doc/xlm.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlm-blueviolet">
+   </a>
+
+`Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_, Guillaume Lample and Alexis Conneau
+
+A transformer model trained on several languages. There are three different types of training for this model and the
+library provides checkpoints for all of them:
+
+  * Causal language modeling (CLM) which is the traditional autoregressive training (so this model could be in the
+    previous section as well). One of the languages is selected for each training sample, and the model input is a
+    sentence of 256 tokens, that may span over several documents in one of those languages.
+  * Masked language modeling (MLM) which is like RoBERTa. One of the languages is selected for each training sample,
+    and the model input is a sentence of 256 tokens, that may span over several documents in one of those languages, with
+    dynamic masking of the tokens.
+  * A combination of MLM and translation language modeling (TLM). This consists of concatenating a sentence in two
+    different languages, with random masking. To predict one of the masked tokens, the model can use both the
+    surrounding context in language 1 and the context given by language 2.
+
+Checkpoints refer to which method was used for pretraining by having `clm`, `mlm` or `mlm-tlm` in their names. On top
+of positional embeddings, the model has language embeddings. When training using MLM/CLM, this gives the model an
+indication of the language used, and when training using MLM+TLM, an indication of the language used for each part.
+
+The library provides a version of the model for language modeling, token classification, sentence classification and
+question answering.
+
+XLM-RoBERTa
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=xlm-roberta">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-xlm--roberta-blueviolet">
+   </a>
+   <a href="model_doc/xlmroberta.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlm--roberta-blueviolet">
+   </a>
+
+`Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_, Alexis Conneau et
+al.
+
+Uses RoBERTa tricks on the XLM approach, but does not use the translation language modeling objective. It only uses
+masked language modeling on sentences coming from one language. However, the model is trained on many more languages
+(100) and doesn't use the language embeddings, so it's capable of detecting the input language by itself.
+
+The library provides a version of the model for masked language modeling, token classification, sentence
+classification, multiple choice classification and question answering.
+
+FlauBERT
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=flaubert">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-flaubert-blueviolet">
+   </a>
+   <a href="model_doc/flaubert.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-flaubert-blueviolet">
+   </a>
+
+`FlauBERT: Unsupervised Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_, Hang Le et al.
+
+Like RoBERTa, without the sentence ordering prediction (so just trained on the MLM objective).
+
+The library provides a version of the model for language modeling and sentence classification.
+
+ELECTRA
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=electra">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-electra-blueviolet">
+   </a>
+   <a href="model_doc/electra.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-electra-blueviolet">
+   </a>
+
+`ELECTRA: Pre-training Text Encoders as Discriminators Rather Than Generators <https://arxiv.org/abs/2003.10555>`_,
+Kevin Clark et al.
+
+ELECTRA is a transformer model pretrained with the use of another (small) masked language model. The inputs are
+corrupted by that language model, which takes an input text that is randomly masked and outputs a text in which ELECTRA
+has to predict which token is an original and which one has been replaced. Like for GAN training, the small language
+model is trained for a few steps (but with the original texts as objective, not to fool the ELECTRA model like in a
+traditional GAN setting) then the ELECTRA model is trained for a few steps.
+
+The library provides a version of the model for masked language modeling, token classification and sentence
+classification.
+
+Funnel Transformer
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=funnel">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-funnel-blueviolet">
+   </a>
+   <a href="model_doc/funnel.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-funnel-blueviolet">
+   </a>
+
+`Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing
+<https://arxiv.org/abs/2006.03236>`_, Zihang Dai et al.
+
+Funnel Transformer is a transformer model using pooling, a bit like a ResNet model: layers are grouped in blocks, and
+at the beginning of each block (except the first one), the hidden states are pooled along the sequence dimension. This
+way, their length is divided by 2, which speeds up the computation of the next hidden states. All pretrained models
+have three blocks, which means the final hidden state has a sequence length that is one fourth of the original sequence
+length.
+
+For tasks such as classification, this is not a problem, but for tasks like masked language modeling or token
+classification, we need a hidden state with the same sequence length as the original input. In those cases, the final
+hidden states are upsampled to the input sequence length and go through two additional layers. That's why there are two
+versions of each checkpoint. The version suffixed with "-base" contains only the three blocks, while the version
+without that suffix contains the three blocks and the upsampling head with its additional layers.
+
+The pretrained models available use the same pretraining objective as ELECTRA.
+
+The library provides a version of the model for masked language modeling, token classification, sentence
+classification, multiple choice classification and question answering.
+
+.. _longformer:
+
+Longformer
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=longformer">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-longformer-blueviolet">
+   </a>
+   <a href="model_doc/longformer.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-longformer-blueviolet">
+   </a>
+
+`Longformer: The Long-Document Transformer <https://arxiv.org/abs/2004.05150>`_, Iz Beltagy et al.
+
+A transformer model replacing the attention matrices by sparse matrices to go faster. Often, the local context (e.g.,
+what are the two tokens left and right?) is enough to take action for a given token. Some preselected input tokens are
+still given global attention, but the attention matrix has far fewer parameters, resulting in a speed-up. See the
+:ref:`local attention section <local-attention>` for more information.
+
+It is pretrained the same way as RoBERTa otherwise.
+
+**Note:** This model could very well be used in an autoregressive setting; there is no checkpoint for such a
+pretraining yet, though.
+
+The library provides a version of the model for masked language modeling, token classification, sentence
+classification, multiple choice classification and question answering.
+
+.. _seq-to-seq-models:
+
+Sequence-to-sequence models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+As mentioned before, these models keep both the encoder and the decoder of the original transformer.
+
+BART
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=bart">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-bart-blueviolet">
+   </a>
+   <a href="model_doc/bart.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-bart-blueviolet">
+   </a>
+
+`BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension
+<https://arxiv.org/abs/1910.13461>`_, Mike Lewis et al.
+
+Sequence-to-sequence model with an encoder and a decoder. Encoder is fed a corrupted version of the tokens, decoder is
+fed the original tokens (but has a mask to hide the future words like a regular transformer decoder). For the encoder, on the
+pretraining tasks, a composition of the following transformations is applied:
+
+  * mask random tokens (like in BERT)
+  * delete random tokens
+  * mask a span of k tokens with a single mask token (a span of 0 tokens is an insertion of a mask token)
+  * permute sentences
+  * rotate the document to make it start at a specific token
+
+The library provides a version of this model for conditional generation and sequence classification.
+
+Pegasus
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=pegasus">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-pegasus-blueviolet">
+   </a>
+   <a href="model_doc/pegasus.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-pegasus-blueviolet">
+   </a>
+
+`PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization
+<https://arxiv.org/pdf/1912.08777.pdf>`_, Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.
+
+Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pre-training objective, called Gap Sentence Generation (GSG).
+
+  * MLM: encoder input tokens are randomly replaced by a mask token and have to be predicted by the encoder (like in BERT)
+  * GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, which has a causal mask to hide the future words like a regular auto-regressive transformer decoder.
+
+In contrast to BART, Pegasus' pretraining task is intentionally similar to summarization: important sentences are masked and are generated together as one output sequence from the remaining sentences, similar to an extractive summary.
+
+The library provides a version of this model for conditional generation, which should be used for summarization.
+
+
+MarianMT
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=marian">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-marian-blueviolet">
+   </a>
+   <a href="model_doc/marian.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-marian-blueviolet">
+   </a>
+
+`Marian: Fast Neural Machine Translation in C++ <https://arxiv.org/abs/1804.00344>`_, Marcin Junczys-Dowmunt et al.
+
+A framework for translation models, using the same models as BART.
+
+The library provides a version of this model for conditional generation.
+
+T5
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=t5">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-t5-blueviolet">
+   </a>
+   <a href="model_doc/t5.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-t5-blueviolet">
+   </a>
+
+`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`_,
+Colin Raffel et al.
+
+Uses the traditional transformer model (with a slight change in the positional embeddings, which are learned at
+each layer). To be able to operate on all NLP tasks, it transforms them into text-to-text problems by using specific
+prefixes: “summarize: ”, “question: ”, “translate English to German: ” and so forth.
+
+The pretraining includes both supervised and self-supervised training. Supervised training is conducted on downstream
+tasks provided by the GLUE and SuperGLUE benchmarks (converting them into text-to-text tasks as explained above).
+
+Self-supervised training uses corrupted tokens, by randomly removing 15% of the tokens and
+replacing them with individual sentinel tokens (if several consecutive tokens are marked for removal, the whole group is replaced with a single sentinel token). The input of the encoder is the corrupted sentence, the input of the decoder is the
+original sentence and the target is then the dropped out tokens delimited by their sentinel tokens.
+
+For instance, if we have the sentence “My dog is very cute .”, and we decide to remove the tokens: "dog", "is" and "cute", the encoder
+input becomes “My <x> very <y> .” and the target input becomes “<x> dog is <y> cute <z>”.
+
+The library provides a version of this model for conditional generation.
+
+MBart
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=mbart">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-mbart-blueviolet">
+   </a>
+   <a href="model_doc/mbart.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-mbart-blueviolet">
+   </a>
+
+`Multilingual Denoising Pre-training for Neural Machine Translation <https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov,
+Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+
+The model architecture and pre-training objective are the same as BART, but MBart is trained on 25 languages
+and is intended for supervised and unsupervised machine translation. MBart is one of the first methods
+for pre-training a complete sequence-to-sequence model by denoising full texts in multiple languages.
+
+The library provides a version of this model for conditional generation.
+
+The `mbart-large-en-ro checkpoint <https://huggingface.co/facebook/mbart-large-en-ro>`_ can be used for English -> Romanian translation.
+
+The `mbart-large-cc25 <https://huggingface.co/facebook/mbart-large-cc25>`_ checkpoint can be finetuned for other translation and summarization tasks, using code in ``examples/seq2seq/``, but is not very useful without finetuning.
+
+.. _multimodal-models:
+
+Multimodal models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+There is one multimodal model in the library which has not been pretrained in the self-supervised fashion like the
+others.
+
+MMBT
+----------------------------------------------
+
+`Supervised Multimodal Bitransformers for Classifying Images and Text <https://arxiv.org/abs/1909.02950>`_, Douwe Kiela
+et al.
+
+A transformers model used in multimodal settings, combining a text and an image to make predictions. The transformer
+model takes as inputs the embeddings of the tokenized text and the final activations of a resnet pretrained on images
+(after the pooling layer) that go through a linear layer (to go from the number of features at the end of the
+resnet to the hidden state dimension of the transformer).
+
+The different inputs are concatenated, and on top of the positional embeddings, a segment embedding is added to let the
+model know which part of the input vector corresponds to the text and which to the image.
+
+The pretrained model only works for classification.
+
+..
+    More information in this :doc:`model documentation </model_doc/mmbt.html>`.
+    TODO: write this page
+
+.. _retrieval-based-models:
+
+Retrieval-based models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Some models use document retrieval during (pre)training and inference for open-domain question answering, for example.
+
+
+DPR
+----------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=dpr">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-dpr-blueviolet">
+   </a>
+   <a href="model_doc/dpr.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-dpr-blueviolet">
+   </a>
+
+`Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`_,
+Vladimir Karpukhin et al.
+
+Dense Passage Retrieval (DPR) is a set of tools and models for state-of-the-art open-domain question-answering research.
+
+
+DPR consists of three models:
+
+  * Question encoder: encode questions as vectors
+  * Context encoder: encode contexts as vectors
+  * Reader: extract the answer of the questions inside retrieved contexts, along with a relevance score (high if the inferred span actually answers the question).
+
+DPR's pipeline (not implemented yet) uses a retrieval step to find the top k contexts given a certain question, and then it calls the reader with the question and the retrieved documents to get the answer.
+
+More technical aspects
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Full vs sparse attention
+----------------------------------------------
+
+Most transformer models use full attention in the sense that the attention matrix is square. It can be a big
+computational bottleneck when you have long texts. Longformer and Reformer are models that try to be more efficient and
+use a sparse version of the attention matrix to speed up training.
+
+.. _lsh-attention:
+
+**LSH attention**
+
+:ref:`Reformer <reformer>` uses LSH attention. In the softmax(QK^t), only the biggest elements (in the softmax
+dimension) of the matrix QK^t are going to give useful contributions. So for each query q in Q, we can consider only
+the keys k in K that are close to q. A hash function is used to determine if q and k are close. The attention mask is
+modified to mask the current token (except at the first position), because it will give a query and a key equal (so very
+similar to each other). Since the hash can be a bit random, several hash functions are used in practice (determined by
+an n_rounds parameter) and then are averaged together.
+
+.. _local-attention:
+
+**Local attention**
+
+:ref:`Longformer <longformer>` uses local attention: often, the local context (e.g., what are the two tokens to the left and
+right?) is enough to take action for a given token. Also, by stacking attention layers that have a small window, the
+last layer will have a receptive field of more than just the tokens in the window, allowing them to build a
+representation of the whole sentence.
+
+Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access
+all tokens and this process is symmetric: all other tokens have access to those specific tokens (on top of the ones in
+their local window). This is shown in Figure 2d of the paper, see below for a sample attention mask:
+
+.. image:: imgs/local_attention_mask.png
+   :scale: 50 %
+   :align: center
+
+Using those attention matrices with fewer parameters then allows the model to have inputs having a bigger sequence
+length.
+
+Other tricks
+----------------------------------------------
+
+.. _axial-pos-encoding:
+
+**Axial positional encodings**
+
+:ref:`Reformer <reformer>` uses axial positional encodings: in traditional transformer models, the positional encoding
+E is a matrix of size :math:`l` by :math:`d`, :math:`l` being the sequence length and :math:`d` the dimension of the
+hidden state. If you have very long texts, this matrix can be huge and take way too much space on the GPU. To alleviate that, axial positional encodings consist of factorizing that big matrix E in two smaller matrices E1 and
+E2, with dimensions :math:`l_{1} \times d_{1}` and :math:`l_{2} \times d_{2}`, such that :math:`l_{1} \times l_{2} = l`
+and :math:`d_{1} + d_{2} = d` (with the product for the lengths, this ends up being way smaller). The embedding for
+time step :math:`j` in E is obtained by concatenating the embeddings for timestep :math:`j \% l1` in E1 and
+:math:`j // l1` in E2.
--- a/docs/source/multilingual.rst
+++ b/docs/source/multilingual.rst
@@ -36,10 +36,11 @@ Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language m

 .. code-block::

-    import torch
-    from transformers import XLMTokenizer, XLMWithLMHeadModel
+    >>> import torch
+    >>> from transformers import XLMTokenizer, XLMWithLMHeadModel

-    tokenizer = XLMTokenizer.from_pretrained("xlm-clm-1024-enfr")
+    >>> tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
+    >>> model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")


 The different languages this model/tokenizer handles, as well as the ids of these languages are visible using the
@@ -47,16 +48,15 @@ The different languages this model/tokenizer handles, as well as the ids of thes

 .. code-block::

-    # Continuation of the previous script
-    print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}
+    >>> print(tokenizer.lang2id)
+    {'en': 0, 'fr': 1}


 These ids should be used when passing a language parameter during a model pass. Let's define our inputs:

 .. code-block::

-    # Continuation of the previous script
-    input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
+    >>> input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1


 We should now define the language embedding by using the previously defined language id. We want to create a tensor
@@ -64,20 +64,18 @@ filled with the appropriate language ids, of the same size as input_ids. For eng

 .. code-block::

-    # Continuation of the previous script
-    language_id = tokenizer.lang2id['en']  # 0
-    langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
+    >>> language_id = tokenizer.lang2id['en']  # 0
+    >>> langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])

-    # We reshape it to be of size (batch_size, sequence_length)
-    langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
+    >>> # We reshape it to be of size (batch_size, sequence_length)
+    >>> langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)


 You can then feed it all as input to your model:

 .. code-block::

-    # Continuation of the previous script
-    outputs = model(input_ids, langs=langs)
+    >>> outputs = model(input_ids, langs=langs)


 The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py>`__
--- a/docs/source/perplexity.rst
+++ b/docs/source/perplexity.rst
@@ -0,0 +1,151 @@
+Perplexity of fixed-length models
+=================================
+
+Perplexity (PPL) is one of the most common metrics for evaluating language
+models. Before diving in, we should note that the metric applies specifically
+to classical language models (sometimes called autoregressive or causal
+language models) and is not well defined for masked language models like BERT
+(see :doc:`summary of the models <model_summary>`).
+
+Perplexity is defined as the exponentiated average log-likelihood of a
+sequence. If we have a tokenized sequence :math:`X = (x_0, x_1, \dots, x_t)`,
+then the perplexity of :math:`X` is,
+
+.. math::
+
+    \text{PPL}(X)
+    = \exp \left\{ {-\frac{1}{t}\sum_i^t \log p_\theta (x_i|x_{<i}) } \right\}
+
+where :math:`\log p_\theta (x_i|x_{<i})` is the log-likelihood of the ith
+token conditioned on the preceding tokens :math:`x_{<i}` according to our
+model. Intuitively, it can be thought of as an evaluation of the model's
+ability to predict uniformly among the set of specified tokens in a corpus.
+Importantly, this means that the tokenization procedure has a direct impact
+on a model's perplexity which should always be taken into consideration when
+comparing different models.
+
+This is also equivalent to the exponentiation of the cross-entropy between
+the data and model predictions. For more intuition about perplexity and its
+relationship to Bits Per Character (BPC) and data compression, check out this
+`fantastic blog post on The Gradient
+<https://thegradient.pub/understanding-evaluation-metrics-for-language-models/>`_.
+
+Calculating PPL with fixed-length models
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+If we weren't limited by a model's context size, we would evaluate the
+model's perplexity by autoregressively factorizing a sequence and
+conditioning on the entire preceding subsequence at each step, as shown
+below.
+
+.. image:: imgs/ppl_full.gif
+    :width: 600
+    :alt: Full decomposition of a sequence with unlimited context length
+
+When working with approximate models, however, we typically have a constraint
+on the number of tokens the model can process. The largest version
+of :doc:`GPT-2 <model_doc/gpt2>`, for example, has a fixed length of 1024
+tokens, so we cannot calculate :math:`p_\theta(x_t|x_{<t})` directly when
+:math:`t` is greater than 1024.
+
+Instead, the sequence is typically broken into subsequences equal to the
+model's maximum input size. If a model's max input size is :math:`k`, we
+then approximate the likelihood of a token :math:`x_t` by conditioning only
+on the :math:`k-1` tokens that precede it rather than the entire context.
+When evaluating the model's perplexity of a sequence, a tempting but
+suboptimal approach is to break the sequence into disjoint chunks and
+add up the decomposed log-likelihoods of each segment independently.
+
+.. image:: imgs/ppl_chunked.gif
+    :width: 600
+    :alt: Suboptimal PPL not taking advantage of full available context
+
+This is quick to compute since the perplexity of each segment can be computed
+in one forward pass, but serves as a poor approximation of the
+fully-factorized perplexity and will typically yield a higher (worse) PPL
+because the model will have less context at most of the prediction steps.
+
+Instead, the PPL of fixed-length models should be evaluated with a
+sliding-window strategy. This involves repeatedly sliding the
+context window so that the model has more context when making each
+prediction.
+
+.. image:: imgs/ppl_sliding.gif
+    :width: 600
+    :alt: Sliding window PPL taking advantage of all available context
+
+This is a closer approximation to the true decomposition of the
+sequence probability and will typically yield a more favorable score.
+The downside is that it requires a separate forward pass for each token in
+the corpus. A good practical compromise is to employ a strided sliding
+window, moving the context by larger strides rather than sliding by 1 token at a
+time. This allows computation to proceed much faster while still giving the
+model a large context to make predictions at each step.
+
+Example: Calculating perplexity with GPT-2 in 🤗 Transformers
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Let's demonstrate this process with GPT-2.
+
+.. code-block:: python
+
+    from transformers import GPT2LMHeadModel, GPT2TokenizerFast
+    device = 'cuda'
+    model_id = 'gpt2-large'
+    model = GPT2LMHeadModel.from_pretrained(model_id).to(device)
+    tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
+
+We'll load in the WikiText-2 dataset and evaluate the perplexity using a few
+different sliding-window strategies. Since this dataset is small and we're
+just doing one forward pass over the set, we can just load and encode the
+entire dataset in memory.
+
+.. code-block:: python
+
+    from nlp import load_dataset
+    test = load_dataset('wikitext', 'wikitext-2-raw-v1', split='test')
+    encodings = tokenizer('\n\n'.join(test['text']), return_tensors='pt')
+
+With 🤗 Transformers, we can simply pass the ``input_ids`` as the ``labels``
+to our model, and the average log-likelihood for each token is returned as
+the loss. With our sliding window approach, however, there is overlap in the
+tokens we pass to the model at each iteration. We don't want the
+log-likelihood for the tokens we're just treating as context to be included
+in our loss, so we can set these targets to ``-100`` so that they are
+ignored. The following is an example of how we could do this with a stride of
+``512``. This means that the model will have at least 512 tokens for context
+when calculating the conditional likelihood of any one token (provided there
+are 512 preceding tokens available to condition on).
+
+.. code-block:: python
+
+    max_length = model.config.n_positions
+    stride = 512
+
+    lls = []
+    for i in tqdm(range(0, encodings.input_ids.size(1), stride)):
+        begin_loc = max(i + stride - max_length, 0)
+        end_loc = i + stride
+        input_ids = encodings.input_ids[:,begin_loc:end_loc].to(device)
+        target_ids = input_ids.clone()
+        target_ids[:,:-stride] = -100
+
+        with torch.no_grad():
+            outputs = model(input_ids, labels=target_ids)
+            log_likelihood = outputs[0] * stride
+
+        lls.append(log_likelihood)
+    
+    ppl = torch.exp(torch.stack(lls).sum() / i)
+
+Running this with the stride length equal to the max input length is
+equivalent to the suboptimal, non-sliding-window strategy we discussed above.
+The smaller the stride, the more context the model will have in making each
+prediction, and the better the reported perplexity will typically be.
+
+When we run the above with ``stride = 1024``, i.e. no overlap, the resulting
+PPL is ``19.64``, which is about the same as the ``19.93`` reported in the
+GPT-2 paper. By using ``stride = 512`` and thereby employing our striding
+window strategy, this jumps down to ``16.53``. This is not only a more
+favorable score, but is calculated in a way that is closer to the true
+autoregressive decomposition of a sequence likelihood.
--- a/docs/source/philosophy.rst
+++ b/docs/source/philosophy.rst
@@ -0,0 +1,73 @@
+Philosophy
+==========
+
+🤗 Transformers is an opinionated library built for:
+
+- NLP researchers and educators seeking to use/study/extend large-scale transformers models
+- hands-on practitioners who want to fine-tune those models and/or serve them in production
+- engineers who just want to download a pretrained model and use it to solve a given NLP task.
+
+The library was designed with two strong goals in mind:
+
+- Be as easy and fast to use as possible:
+
+    - We strongly limited the number of user-facing abstractions to learn; in fact, there are almost no abstractions,
+      just three standard classes required to use each model: :doc:`configuration <main_classes/configuration>`, 
+      :doc:`models <main_classes/model>` and :doc:`tokenizer <main_classes/tokenizer>`.
+    - All of these classes can be initialized in a simple and unified way from pretrained instances by using a common
+      :obj:`from_pretrained()` instantiation method which will take care of downloading (if needed), caching and
+      loading the related class instance and associated data (configurations' hyper-parameters, tokenizers' vocabulary, 
+      and models' weights) from a pretrained checkpoint provided on 
+      `Hugging Face Hub <https://huggingface.co/models>`__ or your own saved checkpoint.
+    - On top of those three base classes, the library provides two APIs: :func:`~transformers.pipeline` for quickly
+      using a model (plus its associated tokenizer and configuration) on a given task and 
+      :func:`~transformers.Trainer`/:func:`~transformers.TFTrainer` to quickly train or fine-tune a given model.
+    - As a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to
+      extend/build-upon the library, just use regular Python/PyTorch/TensorFlow/Keras modules and inherit from the base
+      classes of the library to reuse functionalities like model loading/saving.
+
+- Provide state-of-the-art models with performances as close as possible to the original models:
+
+    - We provide at least one example for each architecture which reproduces a result provided by the official authors
+      of said architecture.
+    - The code is usually as close to the original code base as possible which means some PyTorch code may be not as
+      *pytorchic* as it could be as a result of being converted TensorFlow code and vice versa.
+
+A few other goals:
+
+- Expose the models' internals as consistently as possible:
+
+    - We give access, using a single API, to the full hidden-states and attention weights.
+    - Tokenizer and base model's API are standardized to easily switch between models.
+
+- Incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
+
+    - A simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning.
+    - Simple ways to mask and prune transformer heads.
+
+- Switch easily between PyTorch and TensorFlow 2.0, allowing training using one framework and inference using another.
+
+Main concepts
+~~~~~~~~~~~~~
+
+The library is built around three types of classes for each model:
+
+- **Model classes**  such as :class:`~transformers.BertModel`, which are 30+ PyTorch models 
+  (`torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__) or Keras models 
+  (`tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__) that work with the pretrained
+  weights provided in the library.
+- **Configuration classes** such as :class:`~transformers.BertConfig`, which store all the parameters required to build
+  a model. You don't always need to instantiate these yourself. In particular, if you are using a pretrained model
+  without any modification, creating the model will automatically take care of instantiating the configuration (which
+  is part of the model).
+- **Tokenizer classes** such as :class:`~transformers.BertTokenizer`, which store the vocabulary for each model and
+  provide methods for encoding/decoding strings in a list of token embeddings indices to be fed to a model.
+
+All these classes can be instantiated from pretrained instances and saved locally using two methods:
+
+- :obj:`from_pretrained()` lets you instantiate a model/configuration/tokenizer from a pretrained version either
+  provided by the library itself (the supported models are provided in the list :doc:`here <pretrained_models>`)
+  or stored locally (or on a server) by the user,
+- :obj:`save_pretrained()` lets you save a model/configuration/tokenizer locally so that it can be reloaded using
+  :obj:`from_pretrained()`.
+
--- a/docs/source/preprocessing.rst
+++ b/docs/source/preprocessing.rst
@@ -0,0 +1,343 @@
+Preprocessing data
+==================
+
+In this tutorial, we'll explore how to preprocess your data using 🤗 Transformers. The main tool for this is what we
+call a :doc:`tokenizer <main_classes/tokenizer>`. You can build one using the tokenizer class associated to the model
+you would like to use, or directly with the :class:`~transformers.AutoTokenizer` class.
+
+As we saw in the :doc:`quicktour </quicktour>`, the tokenizer will first split a given text in words (or part of words,
+punctuation symbols, etc.) usually called `tokens`. Then it will convert those `tokens` into numbers, to be able to
+build a tensor out of them and feed them to the model. It will also add any additional inputs the model might expect to
+work properly.
+
+.. note::
+
+    If you plan on using a pretrained model, it's important to use the associated pretrained tokenizer: it will split
+    the text you give it into tokens the same way as for the pretraining corpus, and it will use the same
+    token-to-index correspondence (that we usually call a `vocab`) as during pretraining.
+
+To automatically download the vocab used during pretraining or fine-tuning a given model, you can use the 
+:func:`~transformers.AutoTokenizer.from_pretrained` method:
+
+.. code-block::
+
+    from transformers import AutoTokenizer
+    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')
+
+Base use
+~~~~~~~~
+
+A :class:`~transformers.PreTrainedTokenizer` has many methods, but the only one you need to remember for preprocessing
+is its ``__call__``: you just need to feed your sentence to your tokenizer object.
+
+.. code-block::
+
+    >>> encoded_input = tokenizer("Hello, I'm a single sentence!")
+    >>> print(encoded_input)
+    {'input_ids': [101, 138, 18696, 155, 1942, 3190, 1144, 1572, 13745, 1104, 159, 9664, 2107, 102], 
+     'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 
+     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+
+This returns a dictionary mapping strings to lists of ints.
+The `input_ids <glossary.html#input-ids>`__ are the indices corresponding to each token in our sentence. We will see
+below what the `attention_mask <glossary.html#attention-mask>`__ is used for and in
+:ref:`the next section <sentence-pairs>` the goal of `token_type_ids <glossary.html#token-type-ids>`__.
+
+The tokenizer can decode a list of token ids into a proper sentence:
+
+.. code-block::
+
+    >>> tokenizer.decode(encoded_input["input_ids"])
+    "[CLS] Hello, I'm a single sentence! [SEP]"
+
+As you can see, the tokenizer automatically added some special tokens that the model expects. Not all models need
+special tokens; for instance, if we had used `gpt2-medium` instead of `bert-base-cased` to create our tokenizer, we
+would have seen the same sentence as the original one here. You can disable this behavior (which is only advised if
+you have added those special tokens yourself) by passing ``add_special_tokens=False``.
+
+If you have several sentences you want to process, you can do this efficiently by sending them as a list to the
+tokenizer:
+
+.. code-block::
+
+    >>> batch_sentences = ["Hello I'm a single sentence",
+    ...                    "And another sentence",
+    ...                    "And the very very last one"]
+    >>> encoded_inputs = tokenizer(batch_sentences)
+    >>> print(encoded_inputs)
+    {'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102],
+                   [101, 1262, 1330, 5650, 102],
+                   [101, 1262, 1103, 1304, 1304, 1314, 1141, 102]],
+     'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0],
+                        [0, 0, 0, 0, 0],
+                        [0, 0, 0, 0, 0, 0, 0, 0]],
+     'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1],
+                        [1, 1, 1, 1, 1],
+                        [1, 1, 1, 1, 1, 1, 1, 1]]}
+
+We get back a dictionary once again, this time with values being lists of lists of ints.
+
+If the purpose of sending several sentences at a time to the tokenizer is to build a batch to feed the model, you will
+probably want:
+
+- To pad each sentence to the maximum length there is in your batch.
+- To truncate each sentence to the maximum length the model can accept (if applicable).
+- To return tensors.
+
+You can do all of this by using the following options when feeding your list of sentences to the tokenizer:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> batch = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt")
+    >>> print(batch)
+    {'input_ids': tensor([[ 101, 8667,  146,  112,  182,  170, 1423, 5650,  102],
+                          [ 101, 1262, 1330, 5650,  102,    0,    0,    0,    0],
+                          [ 101, 1262, 1103, 1304, 1304, 1314, 1141,  102,    0]]),
+     'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
+                               [0, 0, 0, 0, 0, 0, 0, 0, 0],
+                               [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 
+     'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
+                               [1, 1, 1, 1, 1, 0, 0, 0, 0],
+                               [1, 1, 1, 1, 1, 1, 1, 1, 0]])}
+    >>> ## TENSORFLOW CODE
+    >>> batch = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="tf")
+    >>> print(batch)
+    {'input_ids': tf.Tensor([[ 101, 8667,  146,  112,  182,  170, 1423, 5650,  102],
+                          [ 101, 1262, 1330, 5650,  102,    0,    0,    0,    0],
+                          [ 101, 1262, 1103, 1304, 1304, 1314, 1141,  102,    0]]),
+     'token_type_ids': tf.Tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0],
+                               [0, 0, 0, 0, 0, 0, 0, 0, 0],
+                               [0, 0, 0, 0, 0, 0, 0, 0, 0]]), 
+     'attention_mask': tf.Tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
+                               [1, 1, 1, 1, 1, 0, 0, 0, 0],
+                               [1, 1, 1, 1, 1, 1, 1, 1, 0]])}
+
+It returns a dictionary mapping strings to tensors. We can now see what the
+`attention_mask <glossary.html#attention-mask>`__ is all about: it points out which tokens the model should pay
+attention to and which ones it should not (because they represent padding in this case).
+
+
+Note that if your model does not have a maximum length associated to it, the command above will throw a warning. You
+can safely ignore it. You can also pass ``verbose=False`` to stop the tokenizer from throwing those kinds of warnings.
+
+.. _sentence-pairs:
+
+Preprocessing pairs of sentences
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Sometimes you need to feed a pair of sentences to your model. For instance, if you want to classify if two sentences
+in a pair are similar, or for question-answering models, which take a context and a question. For BERT models, the
+input is then represented like this: :obj:`[CLS] Sequence A [SEP] Sequence B [SEP]`
+
+You can encode a pair of sentences in the format expected by your model by supplying the two sentences as two arguments
+(not a list since a list of two sentences will be interpreted as a batch of two single sentences, as we saw before).
+This will once again return a dict mapping strings to lists of ints:
+
+.. code-block::
+
+    >>> encoded_input = tokenizer("How old are you?", "I'm 6 years old")
+    >>> print(encoded_input)
+    {'input_ids': [101, 1731, 1385, 1132, 1128, 136, 102, 146, 112, 182, 127, 1201, 1385, 102], 
+     'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1], 
+     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+
+This shows us what the `token_type_ids <glossary.html#token-type-ids>`__ are for: they indicate to the model which part
+of the inputs corresponds to the first sentence and which part corresponds to the second sentence. Note that
+`token_type_ids` are not required or handled by all models. By default, a tokenizer will only return the inputs that
+its associated model expects. You can force the return (or the non-return) of any of those special arguments by
+using ``return_attention_mask`` or ``return_token_type_ids``.
+
+If we decode the token ids we obtained, we will see that the special tokens have been properly added.
+
+.. code-block::
+
+    >>> tokenizer.decode(encoded_input["input_ids"])
+    "[CLS] How old are you? [SEP] I'm 6 years old [SEP]"
+
+If you have a list of pairs of sequences you want to process, you should feed them as two lists to your tokenizer: the
+list of first sentences and the list of second sentences:
+
+.. code-block::
+
+    >>> batch_sentences = ["Hello I'm a single sentence",
+    ...                    "And another sentence",
+    ...                    "And the very very last one"]
+    >>> batch_of_second_sentences = ["I'm a sentence that goes with the first sentence",
+    ...                              "And I should be encoded with the second sentence",
+    ...                              "And I go with the very last one"]
+    >>> encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences)
+    >>> print(encoded_inputs)
+    {'input_ids': [[101, 8667, 146, 112, 182, 170, 1423, 5650, 102, 146, 112, 182, 170, 5650, 1115, 2947, 1114, 1103, 1148, 5650, 102], 
+                   [101, 1262, 1330, 5650, 102, 1262, 146, 1431, 1129, 12544, 1114, 1103, 1248, 5650, 102], 
+                   [101, 1262, 1103, 1304, 1304, 1314, 1141, 102, 1262, 146, 1301, 1114, 1103, 1304, 1314, 1141, 102]], 
+    'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 
+                       [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 
+                       [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 
+    'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 
+                       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 
+                       [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}
+
+As we can see, it returns a dictionary with the values being lists of lists of ints.
+
+To double-check what is fed to the model, we can decode each list in `input_ids` one by one:
+
+.. code-block::
+
+    >>> for ids in encoded_inputs["input_ids"]:
+    >>>     print(tokenizer.decode(ids))
+    [CLS] Hello I'm a single sentence [SEP] I'm a sentence that goes with the first sentence [SEP]
+    [CLS] And another sentence [SEP] And I should be encoded with the second sentence [SEP]
+    [CLS] And the very very last one [SEP] And I go with the very last one [SEP]
+
+Once again, you can automatically pad your inputs to the maximum sentence length in the batch, truncate to the maximum
+length the model can accept and return tensors directly with the following:
+
+.. code-block::
+
+    ## PYTORCH CODE
+    batch = tokenizer(batch_sentences, batch_of_second_sentences, padding=True, truncation=True, return_tensors="pt")
+    ## TENSORFLOW CODE
+    batch = tokenizer(batch_sentences, batch_of_second_sentences, padding=True, truncation=True, return_tensors="tf")
+
+Everything you always wanted to know about padding and truncation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+We have seen the commands that will work for most cases (pad your batch to the length of the maximum sentence and
+truncate to the maximum length the model can accept). However, the API supports more strategies if you need them. The
+three arguments you need to know for this are :obj:`padding`, :obj:`truncation` and :obj:`max_length`.
+
+- :obj:`padding` controls the padding. It can be a boolean or a string which should be:
+
+    - :obj:`True` or :obj:`'longest'` to pad to the longest sequence in the batch (doing no padding if you only provide
+      a single sequence).
+    - :obj:`'max_length'` to pad to a length specified by the :obj:`max_length` argument or the maximum length accepted
+      by the model if no :obj:`max_length` is provided (``max_length=None``). If you only provide a single sequence,
+      padding will still be applied to it. 
+    - :obj:`False` or :obj:`'do_not_pad'` to not pad the sequences. As we have seen before, this is the default
+      behavior.
+
+- :obj:`truncation` controls the truncation. It can be a boolean or a string which should be:
+
+    - :obj:`True` or :obj:`'only_first'` truncate to a maximum length specified by the :obj:`max_length` argument or
+      the maximum length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will
+      only truncate the first sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided.
+    - :obj:`'only_second'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
+      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will only truncate
+      the second sentence of a pair if a pair of sequences (or a batch of pairs of sequences) is provided.
+    - :obj:`'longest_first'` truncate to a maximum length specified by the :obj:`max_length` argument or the maximum
+      length accepted by the model if no :obj:`max_length` is provided (``max_length=None``). This will truncate token
+      by token, removing a token from the longest sequence in the pair until the proper length is reached.
+    - :obj:`False` or :obj:`'do_not_truncate'` to not truncate the sequences. As we have seen before, this is the
+      default behavior.
+
+- :obj:`max_length` to control the length of the padding/truncation. It can be an integer or :obj:`None`, in which case
+  it will default to the maximum length the model can accept. If the model has no specific maximum input length,
+  truncation/padding to :obj:`max_length` is deactivated.
+
+Here is a table summarizing the recommended way to set up padding and truncation. If you use pairs of input sequences
+in any of the following examples, you can replace :obj:`truncation=True` by a :obj:`STRATEGY` selected in
+:obj:`['only_first', 'only_second', 'longest_first']`, i.e. :obj:`truncation='only_second'` or
+:obj:`truncation='longest_first'` to control how both sequences in the pair are truncated as detailed before.
+
++--------------------------------------+-----------------------------------+---------------------------------------------------------------------------------------------+
+| Truncation                           | Padding                           | Instruction                                                                                 |
++======================================+===================================+=============================================================================================+
+| no truncation                        | no padding                        | :obj:`tokenizer(batch_sentences)`                                                           |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to max sequence in batch  | :obj:`tokenizer(batch_sentences, padding=True)` or                                          |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, padding='longest')`                                        |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to max model input length | :obj:`tokenizer(batch_sentences, padding='max_length')`                                     |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to specific length        | :obj:`tokenizer(batch_sentences, padding='max_length', max_length=42)`                      |
++--------------------------------------+-----------------------------------+---------------------------------------------------------------------------------------------+
+| truncation to max model input length | no padding                        | :obj:`tokenizer(batch_sentences, truncation=True)` or                                       |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, truncation=STRATEGY)`                                      |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to max sequence in batch  | :obj:`tokenizer(batch_sentences, padding=True, truncation=True)` or                         |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, padding=True, truncation=STRATEGY)`                        |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to max model input length | :obj:`tokenizer(batch_sentences, padding='max_length', truncation=True)` or                 |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY)`                |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to specific length        | Not possible                                                                                |
++--------------------------------------+-----------------------------------+---------------------------------------------------------------------------------------------+
+| truncation to specific length        | no padding                        | :obj:`tokenizer(batch_sentences, truncation=True, max_length=42)` or                        |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, truncation=STRATEGY, max_length=42)`                       |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to max sequence in batch  | :obj:`tokenizer(batch_sentences, padding=True, truncation=True, max_length=42)` or          |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, padding=True, truncation=STRATEGY, max_length=42)`         |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to max model input length | Not possible                                                                                |
+|                                      +-----------------------------------+---------------------------------------------------------------------------------------------+
+|                                      | padding to specific length        | :obj:`tokenizer(batch_sentences, padding='max_length', truncation=True, max_length=42)` or  |
+|                                      |                                   | :obj:`tokenizer(batch_sentences, padding='max_length', truncation=STRATEGY, max_length=42)` |
++--------------------------------------+-----------------------------------+---------------------------------------------------------------------------------------------+
+
+Pre-tokenized inputs
+~~~~~~~~~~~~~~~~~~~~
+
+The tokenizer also accepts pre-tokenized inputs. This is particularly useful when you want to compute labels and extract
+predictions in `named entity recognition (NER) <https://en.wikipedia.org/wiki/Named-entity_recognition>`__ or
+`part-of-speech tagging (POS tagging) <https://en.wikipedia.org/wiki/Part-of-speech_tagging>`__.
+
+.. warning::
+
+    Pre-tokenized does not mean your inputs are already tokenized (you wouldn't need to pass them through the tokenizer
+    if that was the case) but just split into words (which is often the first step in subword tokenization algorithms
+    like BPE).
+
+If you want to use pre-tokenized inputs, just set :obj:`is_split_into_words=True` when passing your inputs to the
+tokenizer. For instance, we have:
+
+.. code-block::
+
+    >>> encoded_input = tokenizer(["Hello", "I'm", "a", "single", "sentence"], is_split_into_words=True)
+    >>> print(encoded_input)
+    {'input_ids': [101, 8667, 146, 112, 182, 170, 1423, 5650, 102],
+     'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 
+     'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}
+
+Note that the tokenizer still adds the ids of special tokens (if applicable) unless you pass
+``add_special_tokens=False``.
+
+This works exactly as before for batches of sentences or batches of pairs of sentences. You can encode a batch of
+sentences like this:
+
+.. code-block::
+
+    batch_sentences = [["Hello", "I'm", "a", "single", "sentence"],
+                       ["And", "another", "sentence"],
+                       ["And", "the", "very", "very", "last", "one"]]
+    encoded_inputs = tokenizer(batch_sentences, is_split_into_words=True)
+
+or a batch of pairs of sentences like this:
+
+.. code-block::
+
+    batch_of_second_sentences = [["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
+                                 ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
+                                 ["And", "I", "go", "with", "the", "very", "last", "one"]]
+    encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences, is_split_into_words=True)
+
+And you can add padding, truncation as well as directly return tensors like before:
+
+.. code-block::
+
+    ## PYTORCH CODE
+    batch = tokenizer(batch_sentences,
+                      batch_of_second_sentences,
+                      is_split_into_words=True,
+                      padding=True,
+                      truncation=True,
+                      return_tensors="pt")
+    ## TENSORFLOW CODE
+    batch = tokenizer(batch_sentences,
+                      batch_of_second_sentences,
+                      is_split_into_words=True,
+                      padding=True,
+                      truncation=True,
+                      return_tensors="tf")
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -5,309 +5,414 @@ Here is the full list of the currently provided pretrained models together with

 For a list that includes community-uploaded models, refer to `https://huggingface.co/models <https://huggingface.co/models>`__.

-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Architecture      | Shortcut name                                              | Details of the model                                                                                                                  |
-+===================+============================================================+=======================================================================================================================================+
-| BERT              | ``bert-base-uncased``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on lower-cased English text.                                                                                                |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-uncased``                                     | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | Trained on lower-cased English text.                                                                                                |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-cased``                                        | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on cased English text.                                                                                                      |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-cased``                                       | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | Trained on cased English text.                                                                                                      |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-multilingual-uncased``                         | | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters.                                                        |
-|                   |                                                            | | Trained on lower-cased text in the top 102 languages with the largest Wikipedias                                                    |
-|                   |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__).                                              |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-multilingual-cased``                           | | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters.                                                             |
-|                   |                                                            | | Trained on cased text in the top 104 languages with the largest Wikipedias                                                          |
-|                   |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__).                                              |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-chinese``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on cased Chinese Simplified and Traditional text.                                                                           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-german-cased``                                 | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on cased German text by Deepset.ai                                                                                          |
-|                   |                                                            | (see `details on deepset.ai website <https://deepset.ai/german-bert>`__).                                                             |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-uncased-whole-word-masking``                  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | Trained on lower-cased English text using Whole-Word-Masking                                                                        |
-|                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__).                                                                    |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-cased-whole-word-masking``                    | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | Trained on cased English text using Whole-Word-Masking                                                                              |
-|                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__).                                                                    |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                             |
-|                   |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__).           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                                    |
-|                   |                                                            | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                               |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-cased-finetuned-mrpc``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | The ``bert-base-cased`` model fine-tuned on MRPC                                                                                    |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__)                           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-german-dbmdz-cased``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on cased German text by DBMDZ                                                                                               |
-|                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-german-dbmdz-uncased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on uncased German text by DBMDZ                                                                                             |
-|                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``cl-tohoku/bert-base-japanese``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                               |
-|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``cl-tohoku/bert-base-japanese-whole-word-masking``        | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                                      |
-|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``cl-tohoku/bert-base-japanese-char``                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                                        |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``cl-tohoku/bert-base-japanese-char-whole-word-masking``   | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                               |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``TurkuNLP/bert-base-finnish-cased-v1``                    | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on cased Finnish text.                                                                                                      |
-|                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``TurkuNLP/bert-base-finnish-uncased-v1``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on uncased Finnish text.                                                                                                    |
-|                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``wietsedv/bert-base-dutch-cased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on cased Dutch text.                                                                                                        |
-|                   |                                                            | (see `details on wietsedv repository <https://github.com/wietsedv/bertje/>`__).                                                       |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | OpenAI GPT English model                                                                                                            |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| GPT-2             | ``gpt2``                                                   | | 12-layer, 768-hidden, 12-heads, 117M parameters.                                                                                    |
-|                   |                                                            | | OpenAI GPT-2 English model                                                                                                          |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``gpt2-medium``                                            | | 24-layer, 1024-hidden, 16-heads, 345M parameters.                                                                                   |
-|                   |                                                            | | OpenAI's Medium-sized GPT-2 English model                                                                                           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``gpt2-large``                                             | | 36-layer, 1280-hidden, 20-heads, 774M parameters.                                                                                   |
-|                   |                                                            | | OpenAI's Large-sized GPT-2 English model                                                                                            |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``gpt2-xl``                                                | | 48-layer, 1600-hidden, 25-heads, 1558M parameters.                                                                                  |
-|                   |                                                            | | OpenAI's XL-sized GPT-2 English model                                                                                               |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Transformer-XL    | ``transfo-xl-wt103``                                       | | 18-layer, 1024-hidden, 16-heads, 257M parameters.                                                                                   |
-|                   |                                                            | | English model trained on wikitext-103                                                                                               |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| XLNet             | ``xlnet-base-cased``                                       | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | XLNet English model                                                                                                                 |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlnet-large-cased``                                      | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
-|                   |                                                            | | XLNet Large English model                                                                                                           |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| XLM               | ``xlm-mlm-en-2048``                                        | | 12-layer, 2048-hidden, 16-heads                                                                                                     |
-|                   |                                                            | | XLM English model                                                                                                                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-German model trained on the concatenation of English and German wikipedia                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-French model trained on the concatenation of English and French wikipedia                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-enro-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-Romanian Multi-language model                                                                                           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-xnli15-1024``                                    | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
-|                   |                                                            | | XLM Model pre-trained with MLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                             |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-tlm-xnli15-1024``                                | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
-|                   |                                                            | | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                       |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-clm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-French model trained with CLM (Causal Language Modeling) on the concatenation of English and French wikipedia           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-clm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
-|                   |                                                            | | XLM English-German model trained with CLM (Causal Language Modeling) on the concatenation of English and German wikipedia           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-17-1280``                                        | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
-|                   |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 17 languages.                                                              |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-100-1280``                                       | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
-|                   |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 100 languages.                                                             |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| RoBERTa           | ``roberta-base``                                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
-|                   |                                                            | | RoBERTa using the BERT-base architecture                                                                                            |
-|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``roberta-large``                                          | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
-|                   |                                                            | | RoBERTa using the BERT-large architecture                                                                                           |
-|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``roberta-large-mnli``                                     | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
-|                   |                                                            | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__.                                            |
-|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilroberta-base``                                     | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
-|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model ``roberta-base`` checkpoint.                                               |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``roberta-base-openai-detector``                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
-|                   |                                                            | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                             |
-|                   |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``roberta-large-openai-detector``                          | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
-|                   |                                                            | | ``roberta-large`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                            |
-|                   |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| DistilBERT        | ``distilbert-base-uncased``                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
-|                   |                                                            | | The DistilBERT model distilled from the BERT model ``bert-base-uncased`` checkpoint                                                 |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilbert-base-uncased-distilled-squad``                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
-|                   |                                                            | | The DistilBERT model distilled from the BERT model ``bert-base-uncased`` checkpoint, with an additional linear layer.               |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilbert-base-cased``                                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
-|                   |                                                            | | The DistilBERT model distilled from the BERT model ``bert-base-cased`` checkpoint                                                   |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilbert-base-cased-distilled-squad``                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
-|                   |                                                            | | The DistilBERT model distilled from the BERT model ``bert-base-cased`` checkpoint, with an additional question answering layer.     |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilgpt2``                                             | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
-|                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model ``gpt2`` checkpoint.                                                             |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilbert-base-german-cased``                           | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
-|                   |                                                            | | The German DistilBERT model distilled from the German DBMDZ BERT model ``bert-base-german-dbmdz-cased`` checkpoint.                 |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilbert-base-multilingual-cased``                     | | 6-layer, 768-hidden, 12-heads, 134M parameters                                                                                      |
-|                   |                                                            | | The multilingual DistilBERT model distilled from the Multilingual BERT model ``bert-base-multilingual-cased`` checkpoint.           |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| CTRL              | ``ctrl``                                                   | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters                                                                                    |
-|                   |                                                            | | Salesforce's Large-sized CTRL English model                                                                                         |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| CamemBERT         | ``camembert-base``                                         | | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                                     |
-|                   |                                                            | | CamemBERT using the BERT-base architecture                                                                                          |
-|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/camembert>`__)                                                 |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| ALBERT            | ``albert-base-v1``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
-|                   |                                                            | | ALBERT base model                                                                                                                   |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-large-v1``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
-|                   |                                                            | | ALBERT large model                                                                                                                  |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-xlarge-v1``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
-|                   |                                                            | | ALBERT xlarge model                                                                                                                 |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-xxlarge-v1``                                      | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                          |
-|                   |                                                            | | ALBERT xxlarge model                                                                                                                |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-base-v2``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
-|                   |                                                            | | ALBERT base model with no dropout, additional training data and longer training                                                     |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-large-v2``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
-|                   |                                                            | | ALBERT large model with no dropout, additional training data and longer training                                                    |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-xlarge-v2``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
-|                   |                                                            | | ALBERT xlarge model with no dropout, additional training data and longer training                                                   |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``albert-xxlarge-v2``                                      | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                          |
-|                   |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                  |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| T5                | ``t5-small``                                               | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads,                                           |
-|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-base``                                                | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads,                                        |
-|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-large``                                               | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
-|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-3B``                                                  | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads,                                      |
-|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-11B``                                                 | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads,                                      |
-|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| XLM-RoBERTa       | ``xlm-roberta-base``                                       | | ~125M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 8-heads,                                         |
-|                   |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-roberta-large``                                      | | ~355M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
-|                   |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| FlauBERT          | ``flaubert/flaubert_small_cased``                          | | 6-layer, 512-hidden, 8-heads, 54M parameters                                                                                        |
-|                   |                                                            | | FlauBERT small architecture                                                                                                         |
-|                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert/flaubert_base_uncased``                         | | 12-layer, 768-hidden, 12-heads, 137M parameters                                                                                     |
-|                   |                                                            | | FlauBERT base architecture with uncased vocabulary                                                                                  |
-|                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert/flaubert_base_cased``                           | | 12-layer, 768-hidden, 12-heads, 138M parameters                                                                                     |
-|                   |                                                            | | FlauBERT base architecture with cased vocabulary                                                                                    |
-|                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``flaubert/flaubert_large_cased``                          | | 24-layer, 1024-hidden, 16-heads, 373M parameters                                                                                    |
-|                   |                                                            | | FlauBERT large architecture                                                                                                         |
-|                   |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Bart              | ``facebook/bart-large``                                    | | 24-layer, 1024-hidden, 16-heads, 406M parameters                                                                                    |
-|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/bart>`__)                                                      |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``facebook/bart-large-mnli``                               | | Adds a 2 layer classification head with 1 million parameters                                                                        |
-|                   |                                                            | | bart-large base architecture with a classification head, finetuned on MNLI                                                          |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``facebook/bart-large-cnn``                                | | 12-layer, 1024-hidden, 16-heads, 406M parameters       (same as base)                                                               |
-|                   |                                                            | | bart-large base architecture finetuned on cnn summarization task                                                                    |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``facebook/mbart-large-en-ro``                             | | 12-layer, 1024-hidden, 16-heads, 880M parameters                                                                                    |
-|                   |                                                            | | bart-large architecture pretrained on cc25 multilingual data, finetuned on WMT English-Romanian translation.                        |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| DialoGPT          | ``DialoGPT-small``                                         | | 12-layer, 768-hidden, 12-heads, 124M parameters                                                                                     |
-|                   |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``DialoGPT-medium``                                        | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
-|                   |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``DialoGPT-large``                                         | | 36-layer, 1280-hidden, 20-heads, 774M parameters                                                                                    |
-|                   |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Reformer          | ``reformer-enwik8``                                        | | 12-layer, 1024-hidden, 8-heads, 149M parameters                                                                                     |
-|                   |                                                            | | Trained on English Wikipedia data - enwik8.                                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``reformer-crime-and-punishment``                          | | 6-layer, 256-hidden, 2-heads, 3M parameters                                                                                         |
-|                   |                                                            | | Trained on English text: Crime and Punishment novel by Fyodor Dostoyevsky.                                                          |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| MarianMT          | ``Helsinki-NLP/opus-mt-{src}-{tgt}``                       | | 12-layer, 512-hidden, 8-heads, ~74M parameter Machine translation models. Parameter counts vary depending on vocab size.            |
-|                   |                                                            | | (see `model list <https://huggingface.co/Helsinki-NLP>`_)                                                                           |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| Longformer        | ``allenai/longformer-base-4096``                           | | 12-layer, 768-hidden, 12-heads, ~149M parameters                                                                                    |
-|                   |                                                            | | Starting from RoBERTa-base checkpoint, trained on documents of max length 4,096                                                     |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``allenai/longformer-large-4096``                          | | 24-layer, 1024-hidden, 16-heads, ~435M parameters                                                                                   |
-|                   |                                                            | | Starting from RoBERTa-large checkpoint, trained on documents of max length 4,096                                                    |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Architecture       | Shortcut name                                              | Details of the model                                                                                                                  |
+====================+============================================================+=======================================================================================================================================+
+| BERT               | ``bert-base-uncased``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on lower-cased English text.                                                                                                |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-large-uncased``                                     | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                    |                                                            | | Trained on lower-cased English text.                                                                                                |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-base-cased``                                        | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on cased English text.                                                                                                      |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-large-cased``                                       | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                    |                                                            | | Trained on cased English text.                                                                                                      |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-base-multilingual-uncased``                         | | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters.                                                        |
+|                    |                                                            | | Trained on lower-cased text in the top 102 languages with the largest Wikipedias                                                    |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__).                                              |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-base-multilingual-cased``                           | | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters.                                                             |
+|                    |                                                            | | Trained on cased text in the top 104 languages with the largest Wikipedias                                                          |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__).                                              |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-base-chinese``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on cased Chinese Simplified and Traditional text.                                                                           |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-base-german-cased``                                 | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on cased German text by Deepset.ai                                                                                          |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details on deepset.ai website <https://deepset.ai/german-bert>`__).                                                             |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-large-uncased-whole-word-masking``                  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                    |                                                            | | Trained on lower-cased English text using Whole-Word-Masking                                                                        |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__).                                                                    |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-large-cased-whole-word-masking``                    | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                    |                                                            | | Trained on cased English text using Whole-Word-Masking                                                                              |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__).                                                                    |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                    |                                                            | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                             |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see details of fine-tuning in the `example section <https://github.com/huggingface/transformers/tree/master/examples>`__).           |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-large-cased-whole-word-masking-finetuned-squad``    | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                    |                                                            | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                               |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__).                          |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-base-cased-finetuned-mrpc``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | The ``bert-base-cased`` model fine-tuned on MRPC                                                                                    |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/transformers/examples.html>`__).                          |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-base-german-dbmdz-cased``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on cased German text by DBMDZ                                                                                               |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``bert-base-german-dbmdz-uncased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on uncased German text by DBMDZ                                                                                             |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``cl-tohoku/bert-base-japanese``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece and this requires some extra dependencies,                     |
+|                    |                                                            | | `fugashi <https://github.com/polm/fugashi>`__ which is a wrapper around `MeCab <https://taku910.github.io/mecab/>`__.               |
+|                    |                                                            | | Use ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install from source) to install them.                  |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``cl-tohoku/bert-base-japanese-whole-word-masking``        | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece and this requires some extra dependencies,                     |
+|                    |                                                            | | `fugashi <https://github.com/polm/fugashi>`__ which is a wrapper around `MeCab <https://taku910.github.io/mecab/>`__.               |
+|                    |                                                            | | Use ``pip install transformers["ja"]`` (or ``pip install -e .["ja"]`` if you install from source) to install them.                  |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``cl-tohoku/bert-base-japanese-char``                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                                        |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``cl-tohoku/bert-base-japanese-char-whole-word-masking``   | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                               |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``TurkuNLP/bert-base-finnish-cased-v1``                    | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on cased Finnish text.                                                                                                      |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``TurkuNLP/bert-base-finnish-uncased-v1``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on uncased Finnish text.                                                                                                    |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``wietsedv/bert-base-dutch-cased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | Trained on cased Dutch text.                                                                                                        |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details on wietsedv repository <https://github.com/wietsedv/bertje/>`__).                                                       |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| GPT                | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | OpenAI GPT English model                                                                                                            |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| GPT-2              | ``gpt2``                                                   | | 12-layer, 768-hidden, 12-heads, 117M parameters.                                                                                    |
+|                    |                                                            | | OpenAI GPT-2 English model                                                                                                          |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``gpt2-medium``                                            | | 24-layer, 1024-hidden, 16-heads, 345M parameters.                                                                                   |
+|                    |                                                            | | OpenAI's Medium-sized GPT-2 English model                                                                                           |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``gpt2-large``                                             | | 36-layer, 1280-hidden, 20-heads, 774M parameters.                                                                                   |
+|                    |                                                            | | OpenAI's Large-sized GPT-2 English model                                                                                            |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``gpt2-xl``                                                | | 48-layer, 1600-hidden, 25-heads, 1558M parameters.                                                                                  |
+|                    |                                                            | | OpenAI's XL-sized GPT-2 English model                                                                                               |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Transformer-XL     | ``transfo-xl-wt103``                                       | | 18-layer, 1024-hidden, 16-heads, 257M parameters.                                                                                   |
+|                    |                                                            | | English model trained on wikitext-103                                                                                               |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| XLNet              | ``xlnet-base-cased``                                       | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                    |                                                            | | XLNet English model                                                                                                                 |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``xlnet-large-cased``                                      | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                    |                                                            | | XLNet Large English model                                                                                                           |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| XLM                | ``xlm-mlm-en-2048``                                        | | 12-layer, 2048-hidden, 16-heads                                                                                                     |
+|                    |                                                            | | XLM English model                                                                                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``xlm-mlm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
+|                    |                                                            | | XLM English-German model trained on the concatenation of English and German wikipedia                                               |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``xlm-mlm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
+|                    |                                                            | | XLM English-French model trained on the concatenation of English and French wikipedia                                               |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``xlm-mlm-enro-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
+|                    |                                                            | | XLM English-Romanian Multi-language model                                                                                           |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``xlm-mlm-xnli15-1024``                                    | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                    |                                                            | | XLM Model pre-trained with MLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                             |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``xlm-mlm-tlm-xnli15-1024``                                | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                    |                                                            | | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                       |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``xlm-clm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
+|                    |                                                            | | XLM English-French model trained with CLM (Causal Language Modeling) on the concatenation of English and French wikipedia           |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``xlm-clm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
+|                    |                                                            | | XLM English-German model trained with CLM (Causal Language Modeling) on the concatenation of English and German wikipedia           |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``xlm-mlm-17-1280``                                        | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
+|                    |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 17 languages.                                                              |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``xlm-mlm-100-1280``                                       | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
+|                    |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 100 languages.                                                             |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| RoBERTa            | ``roberta-base``                                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
+|                    |                                                            | | RoBERTa using the BERT-base architecture                                                                                            |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``roberta-large``                                          | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
+|                    |                                                            | | RoBERTa using the BERT-large architecture                                                                                           |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``roberta-large-mnli``                                     | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
+|                    |                                                            | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__.                                            |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``distilroberta-base``                                     | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
+|                    |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model ``roberta-base`` checkpoint.                                               |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``roberta-base-openai-detector``                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
+|                    |                                                            | | ``roberta-base`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                             |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``roberta-large-openai-detector``                          | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
+|                    |                                                            | | ``roberta-large`` fine-tuned by OpenAI on the outputs of the 1.5B-parameter GPT-2 model.                                            |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/openai/gpt-2-output-dataset/tree/master/detector>`__)                                               |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| DistilBERT         | ``distilbert-base-uncased``                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
+|                    |                                                            | | The DistilBERT model distilled from the BERT model ``bert-base-uncased`` checkpoint                                                 |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``distilbert-base-uncased-distilled-squad``                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
+|                    |                                                            | | The DistilBERT model distilled from the BERT model ``bert-base-uncased`` checkpoint, with an additional linear layer.               |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``distilbert-base-cased``                                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
+|                    |                                                            | | The DistilBERT model distilled from the BERT model ``bert-base-cased`` checkpoint                                                   |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``distilbert-base-cased-distilled-squad``                  | | 6-layer, 768-hidden, 12-heads, 65M parameters                                                                                       |
+|                    |                                                            | | The DistilBERT model distilled from the BERT model ``bert-base-cased`` checkpoint, with an additional question answering layer.     |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``distilgpt2``                                             | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
+|                    |                                                            | | The DistilGPT2 model distilled from the GPT2 model ``gpt2`` checkpoint.                                                             |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``distilbert-base-german-cased``                           | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
+|                    |                                                            | | The German DistilBERT model distilled from the German DBMDZ BERT model ``bert-base-german-dbmdz-cased`` checkpoint.                 |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``distilbert-base-multilingual-cased``                     | | 6-layer, 768-hidden, 12-heads, 134M parameters                                                                                      |
+|                    |                                                            | | The multilingual DistilBERT model distilled from the Multilingual BERT model ``bert-base-multilingual-cased`` checkpoint.           |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| CTRL               | ``ctrl``                                                   | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters                                                                                    |
+|                    |                                                            | | Salesforce's Large-sized CTRL English model                                                                                         |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| CamemBERT          | ``camembert-base``                                         | | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                                     |
+|                    |                                                            | | CamemBERT using the BERT-base architecture                                                                                          |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/camembert>`__)                                                 |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| ALBERT             | ``albert-base-v1``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
+|                    |                                                            | | ALBERT base model                                                                                                                   |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``albert-large-v1``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
+|                    |                                                            | | ALBERT large model                                                                                                                  |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``albert-xlarge-v1``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
+|                    |                                                            | | ALBERT xlarge model                                                                                                                 |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``albert-xxlarge-v1``                                      | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                          |
+|                    |                                                            | | ALBERT xxlarge model                                                                                                                |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``albert-base-v2``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
+|                    |                                                            | | ALBERT base model with no dropout, additional training data and longer training                                                     |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``albert-large-v2``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
+|                    |                                                            | | ALBERT large model with no dropout, additional training data and longer training                                                    |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``albert-xlarge-v2``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
+|                    |                                                            | | ALBERT xlarge model with no dropout, additional training data and longer training                                                   |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``albert-xxlarge-v2``                                      | | 12 repeating layers, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                          |
+|                    |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                  |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| T5                 | ``t5-small``                                               | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads.                                           |
+|                    |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``t5-base``                                                | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads,                                        |
+|                    |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``t5-large``                                               | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
+|                    |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``t5-3B``                                                  | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads,                                      |
+|                    |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``t5-11B``                                                 | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads,                                      |
+|                    |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| XLM-RoBERTa        | ``xlm-roberta-base``                                       | | ~125M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 8-heads,                                         |
+|                    |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``xlm-roberta-large``                                      | | ~355M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
+|                    |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| FlauBERT           | ``flaubert/flaubert_small_cased``                          | | 6-layer, 512-hidden, 8-heads, 54M parameters                                                                                        |
+|                    |                                                            | | FlauBERT small architecture                                                                                                         |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``flaubert/flaubert_base_uncased``                         | | 12-layer, 768-hidden, 12-heads, 137M parameters                                                                                     |
+|                    |                                                            | | FlauBERT base architecture with uncased vocabulary                                                                                  |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``flaubert/flaubert_base_cased``                           | | 12-layer, 768-hidden, 12-heads, 138M parameters                                                                                     |
+|                    |                                                            | | FlauBERT base architecture with cased vocabulary                                                                                    |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``flaubert/flaubert_large_cased``                          | | 24-layer, 1024-hidden, 16-heads, 373M parameters                                                                                    |
+|                    |                                                            | | FlauBERT large architecture                                                                                                         |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/getalp/Flaubert>`__)                                                                                |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Bart               | ``facebook/bart-large``                                    | | 24-layer, 1024-hidden, 16-heads, 406M parameters                                                                                    |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_)                                                       |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``facebook/bart-base``                                     | | 12-layer, 768-hidden, 16-heads, 139M parameters                                                                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``facebook/bart-large-mnli``                               | | Adds a 2 layer classification head with 1 million parameters                                                                        |
+|                    |                                                            | | bart-large base architecture with a classification head, finetuned on MNLI                                                          |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``facebook/bart-large-cnn``                                | | 24-layer, 1024-hidden, 16-heads, 406M parameters      (same as large)                                                               |
+|                    |                                                            | | bart-large base architecture finetuned on cnn summarization task                                                                    |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| DialoGPT           | ``DialoGPT-small``                                         | | 12-layer, 768-hidden, 12-heads, 124M parameters                                                                                     |
+|                    |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``DialoGPT-medium``                                        | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
+|                    |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``DialoGPT-large``                                         | | 36-layer, 1280-hidden, 20-heads, 774M parameters                                                                                    |
+|                    |                                                            | | Trained on English text: 147M conversation-like exchanges extracted from Reddit.                                                    |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Reformer           | ``reformer-enwik8``                                        | | 12-layer, 1024-hidden, 8-heads, 149M parameters                                                                                     |
+|                    |                                                            | | Trained on English Wikipedia data - enwik8.                                                                                         |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``reformer-crime-and-punishment``                          | | 6-layer, 256-hidden, 2-heads, 3M parameters                                                                                         |
+|                    |                                                            | | Trained on English text: Crime and Punishment novel by Fyodor Dostoyevsky.                                                          |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| MarianMT           | ``Helsinki-NLP/opus-mt-{src}-{tgt}``                       | | 12-layer, 512-hidden, 8-heads, ~74M parameter Machine translation models. Parameter counts vary depending on vocab size.            |
+|                    |                                                            | | (see `model list <https://huggingface.co/Helsinki-NLP>`_)                                                                           |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Pegasus            | ``google/pegasus-{dataset}``                               | | 16-layer, 1024-hidden, 16-heads, ~568M parameter, 2.2 GB for summary. `model list <https://huggingface.co/models?search=pegasus>`__ |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Longformer         | ``allenai/longformer-base-4096``                           | | 12-layer, 768-hidden, 12-heads, ~149M parameters                                                                                    |
+|                    |                                                            | | Starting from RoBERTa-base checkpoint, trained on documents of max length 4,096                                                     |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``allenai/longformer-large-4096``                          | | 24-layer, 1024-hidden, 16-heads, ~435M parameters                                                                                   |
+|                    |                                                            | | Starting from RoBERTa-large checkpoint, trained on documents of max length 4,096                                                    |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| MBart              | ``facebook/mbart-large-cc25``                              | | 24-layer, 1024-hidden, 16-heads, 610M parameters                                                                                    |
+|                    |                                                            | | mBART (bart-large architecture) model trained on 25 languages' monolingual corpus                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``facebook/mbart-large-en-ro``                             | | 24-layer, 1024-hidden, 16-heads, 610M parameters                                                                                    |
+|                    |                                                            | | mbart-large-cc25 model finetuned on WMT english romanian translation.                                                               |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Lxmert             | ``lxmert-base-uncased``                                    | | 9-language layers, 9-relationship layers, and 12-cross-modality layers                                                              |
+|                    |                                                            | | 768-hidden, 12-heads (for each layer) ~ 228M parameters                                                                             |
+|                    |                                                            | | Starting from lxmert-base checkpoint, trained on over 9 million image-text couplets from COCO, VisualGenome, GQA, VQA               |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Funnel Transformer | ``funnel-transformer/small``                               | | 14 layers: 3 blocks of 4 layers then 2 layers decoder, 768-hidden, 12-heads, 130M parameters                                        |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``funnel-transformer/small-base``                          | | 12 layers: 3 blocks of 4 layers (no decoder), 768-hidden, 12-heads, 115M parameters                                                 |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``funnel-transformer/medium``                              | | 14 layers: 3 blocks 6, 3x2, 3x2 layers then 2 layers decoder, 768-hidden, 12-heads, 130M parameters                                 |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``funnel-transformer/medium-base``                         | | 12 layers: 3 blocks 6, 3x2, 3x2 layers(no decoder), 768-hidden, 12-heads, 115M parameters                                           |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``funnel-transformer/intermediate``                        | | 20 layers: 3 blocks of 6 layers then 2 layers decoder, 768-hidden, 12-heads, 177M parameters                                        |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``funnel-transformer/intermediate-base``                   | | 18 layers: 3 blocks of 6 layers (no decoder), 768-hidden, 12-heads, 161M parameters                                                 |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``funnel-transformer/large``                               | | 26 layers: 3 blocks of 8 layers then 2 layers decoder, 1024-hidden, 12-heads, 386M parameters                                       |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``funnel-transformer/large-base``                          | | 24 layers: 3 blocks of 8 layers (no decoder), 1024-hidden, 12-heads, 358M parameters                                                |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``funnel-transformer/xlarge``                              | | 32 layers: 3 blocks of 10 layers then 2 layers decoder, 1024-hidden, 12-heads, 468M parameters                                      |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``funnel-transformer/xlarge-base``                         | | 30 layers: 3 blocks of 10 layers (no decoder), 1024-hidden, 12-heads, 440M parameters                                               |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/laiguokun/Funnel-Transformer>`__)                                                                   |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| LayoutLM           | ``microsoft/layoutlm-base-uncased``                        | | 12 layers, 768-hidden, 12-heads, 113M parameters                                                                                    |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/microsoft/unilm/tree/master/layoutlm>`__)                                                           |
+|                    +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                    | ``microsoft/layoutlm-large-uncased``                       | | 24 layers, 1024-hidden, 16-heads, 343M parameters                                                                                   |
+|                    |                                                            |                                                                                                                                       |
+|                    |                                                            | (see `details <https://github.com/microsoft/unilm/tree/master/layoutlm>`__)                                                           |
+--------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -1,222 +0,0 @@
-# Quickstart
-
-## Philosophy
-
-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
-
-The library was designed with two strong goals in mind:
-
- be as easy and fast to use as possible:
-
-  - we strongly limited the number of user-facing abstractions to learn, in fact, there are almost no abstractions, just three standard classes required to use each model: configuration, models and tokenizer,
-  - all of these classes can be initialized in a simple and unified way from pretrained instances by using a common `from_pretrained()` instantiation method which will take care of downloading (if needed), caching and loading the related class from a pretrained instance supplied in the library or your own saved instance.
-  - as a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to extend/build-upon the library, just use regular Python/PyTorch modules and inherit from the base classes of the library to reuse functionalities like model loading/saving.
-
- provide state-of-the-art models with performances as close as possible to the original models:
-
-  - we provide at least one example for each architecture which reproduces a result provided by the official authors of said architecture,
-  - the code is usually as close to the original code base as possible which means some PyTorch code may be not as *pytorchic* as it could be as a result of being converted TensorFlow code.
-
-A few other goals:
-
- expose the models' internals as consistently as possible:
-
-  - we give access, using a single API to the full hidden-states and attention weights,
-  - tokenizer and base model's API are standardized to easily switch between models.
-
- incorporate a subjective selection of promising tools for fine-tuning/investigating these models:
-
-  - a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
-  - simple ways to mask and prune transformer heads.
-
-## Main concepts
-
-The library is build around three types of classes for each model:
-
- **model classes**  e.g., `BertModel` which are 20+ PyTorch models (`torch.nn.Modules`) that work with the pretrained weights provided in the library. In TF2, these are `tf.keras.Model`.
- **configuration classes** which store all the parameters required to build a model, e.g., `BertConfig`. You don't always need to instantiate these your-self. In particular, if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings in a list of token embeddings indices to be fed to a model, e.g., `BertTokenizer`
-
-All these classes can be instantiated from pretrained instances and saved locally using two methods:
-
- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
- `save_pretrained()` let you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
-
-We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized into two parts:
-
- the **MAIN CLASSES** section details the common functionalities/method/attributes of the three main type of classes (configuration, model, tokenizer) plus some optimization related classes provided as utilities for training,
- the **PACKAGE REFERENCE** section details all the variants of each class for each model architectures and, in particular, the input/output that you should expect when calling each of them.
-
-## Quick tour: Usage
-
-Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
-
-See the full API reference for examples of each model class.
-
-### BERT example
-
-Let's start by preparing a tokenized input (a list of token embeddings indices to be fed to Bert) from a text string using `BertTokenizer`
-
-```python
-import torch
-from transformers import BertTokenizer, BertModel, BertForMaskedLM
-
-# OPTIONAL: if you want to have more information on what's happening under the hood, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-
-# Tokenize input
-text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-tokenized_text = tokenizer.tokenize(text)
-
-# Mask a token that we will try to predict back with `BertForMaskedLM`
-masked_index = 8
-tokenized_text[masked_index] = '[MASK]'
-assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
-
-# Convert token to vocabulary indices
-indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-# Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
-segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
-
-# Convert inputs to PyTorch tensors
-tokens_tensor = torch.tensor([indexed_tokens])
-segments_tensors = torch.tensor([segments_ids])
-```
-
-Let's see how we can use `BertModel` to encode our inputs in hidden-states:
-
-```python
-# Load pre-trained model (weights)
-model = BertModel.from_pretrained('bert-base-uncased')
-
-# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproducible results during evaluation!
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-segments_tensors = segments_tensors.to('cuda')
-model.to('cuda')
-
-# Predict hidden states features for each layer
-with torch.no_grad():
-    # See the models docstrings for the detail of the inputs
-    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
-    # Transformers models always output tuples.
-    # See the models docstrings for the detail of all the outputs
-    # In our case, the first element is the hidden state of the last layer of the Bert model
-    encoded_layers = outputs[0]
-# We have encoded our input sequence in a FloatTensor of shape (batch size, sequence length, model hidden dimension)
-assert tuple(encoded_layers.shape) == (1, len(indexed_tokens), model.config.hidden_size)
-```
-
-And how to use `BertForMaskedLM` to predict a masked token:
-
-```python
-# Load pre-trained model (weights)
-model = BertForMaskedLM.from_pretrained('bert-base-uncased')
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-segments_tensors = segments_tensors.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(tokens_tensor, token_type_ids=segments_tensors)
-    predictions = outputs[0]
-
-# confirm we were able to predict 'henson'
-predicted_index = torch.argmax(predictions[0, masked_index]).item()
-predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-assert predicted_token == 'henson'
-```
-
-### OpenAI GPT-2
-
-Here is a quick-start example using `GPT2Tokenizer` and `GPT2LMHeadModel` class with OpenAI's pre-trained model to predict the next token from a text prompt.
-
-First let's prepare a tokenized input from our text string using `GPT2Tokenizer`
-
-```python
-import torch
-from transformers import GPT2Tokenizer, GPT2LMHeadModel
-
-# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
-import logging
-logging.basicConfig(level=logging.INFO)
-
-# Load pre-trained model tokenizer (vocabulary)
-tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-
-# Encode a text inputs
-text = "Who was Jim Henson ? Jim Henson was a"
-indexed_tokens = tokenizer.encode(text)
-
-# Convert indexed tokens in a PyTorch tensor
-tokens_tensor = torch.tensor([indexed_tokens])
-```
-
-Let's see how to use `GPT2LMHeadModel` to generate the next token following our text:
-
-```python
-# Load pre-trained model (weights)
-model = GPT2LMHeadModel.from_pretrained('gpt2')
-
-# Set the model in evaluation mode to deactivate the DropOut modules
-# This is IMPORTANT to have reproducible results during evaluation!
-model.eval()
-
-# If you have a GPU, put everything on cuda
-tokens_tensor = tokens_tensor.to('cuda')
-model.to('cuda')
-
-# Predict all tokens
-with torch.no_grad():
-    outputs = model(tokens_tensor)
-    predictions = outputs[0]
-
-# get the predicted next sub-word (in our case, the word 'man')
-predicted_index = torch.argmax(predictions[0, -1, :]).item()
-predicted_text = tokenizer.decode(indexed_tokens + [predicted_index])
-assert predicted_text == 'Who was Jim Henson? Jim Henson was a man'
-```
-
-Examples for each model class of each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [documentation](#documentation).
-
-#### Using the past
-
-GPT-2, as well as some other models (GPT, XLNet, Transfo-XL, CTRL), make use of a `past` or `mems` attribute which can be used to prevent re-computing the key/value pairs when using sequential decoding. It is useful when generating sequences as a big part of the attention mechanism benefits from previous computations.
-
-Here is a fully-working example using the `past` with `GPT2LMHeadModel` and argmax decoding (which should only be used as an example, as argmax decoding introduces a lot of repetition):
-
-```python
-from transformers import GPT2LMHeadModel, GPT2Tokenizer
-import torch
-
-tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
-model = GPT2LMHeadModel.from_pretrained('gpt2')
-
-generated = tokenizer.encode("The Manhattan bridge")
-context = torch.tensor([generated])
-past = None
-
-for i in range(100):
-    print(i)
-    output, past = model(context, past=past)
-    token = torch.argmax(output[..., -1, :])
-
-    generated += [token.tolist()]
-    context = token.unsqueeze(0)
-
-sequence = tokenizer.decode(generated)
-
-print(sequence)
-```
-
-The model only requires a single token as input as all the previous tokens' key/value pairs are contained in the `past`.
--- a/docs/source/quicktour.rst
+++ b/docs/source/quicktour.rst
@@ -0,0 +1,404 @@
+Quick tour
+==========
+
+Let's have a quick look at the 🤗 Transformers library features. The library downloads pretrained models for
+Natural Language Understanding (NLU) tasks, such as analyzing the sentiment of a text, and Natural Language Generation (NLG),
+such as completing a prompt with new text or translating into another language.
+
+First we will see how to easily leverage the pipeline API to quickly use those pretrained models at inference. Then, we
+will dig a little bit more and see how the library gives you access to those models and helps you preprocess your data.
+
+.. note::
+
+    All code examples presented in the documentation have a switch on the top left for PyTorch versus TensorFlow. If
+    not, the code is expected to work for both backends without any change needed.
+
+Getting started on a task with a pipeline
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The easiest way to use a pretrained model on a given task is to use :func:`~transformers.pipeline`. 🤗 Transformers
+provides the following tasks out of the box:
+
+- Sentiment analysis: is a text positive or negative?
+- Text generation (in English): provide a prompt and the model will generate what follows.
+- Named entity recognition (NER): in an input sentence, label each word with the entity it represents (person, place,
+  etc.)
+- Question answering: provide the model with some context and a question, extract the answer from the context.
+- Filling masked text: given a text with masked words (e.g., replaced by ``[MASK]``), fill the blanks.
+- Summarization: generate a summary of a long text.
+- Translation: translate a text into another language.
+- Feature extraction: return a tensor representation of the text.
+
+Let's see how this works for sentiment analysis (the other tasks are all covered in the
+:doc:`task summary </task_summary>`):
+
+.. code-block::
+
+    >>> from transformers import pipeline
+    >>> classifier = pipeline('sentiment-analysis')
+
+When typing this command for the first time, a pretrained model and its tokenizer are downloaded and cached. We will
+look at both later on, but as an introduction the tokenizer's job is to preprocess the text for the model, which is
+then responsible for making predictions. The pipeline groups all of that together, and post-processes the predictions to
+make them readable. For instance:
+
+
+.. code-block::
+
+    >>> classifier('We are very happy to show you the 🤗 Transformers library.')
+    [{'label': 'POSITIVE', 'score': 0.9997795224189758}]
+
+That's encouraging! You can use it on a list of sentences, which will be preprocessed then fed to the model as a
+`batch`, returning a list of dictionaries like this one:
+
+.. code-block::
+
+    >>> results = classifier(["We are very happy to show you the 🤗 Transformers library.",
+    ...            "We hope you don't hate it."])
+    >>> for result in results:
+    ...     print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
+    label: POSITIVE, with score: 0.9998
+    label: NEGATIVE, with score: 0.5309
+
+You can see the second sentence has been classified as negative (it needs to be positive or negative) but its score is
+fairly neutral.
+
+By default, the model downloaded for this pipeline is called "distilbert-base-uncased-finetuned-sst-2-english". We can
+look at its `model page <https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english>`__ to get more
+information about it. It uses the :doc:`DistilBERT architecture </model_doc/distilbert>` and has been fine-tuned on a
+dataset called SST-2 for the sentiment analysis task.
+
+Let's say we want to use another model; for instance, one that has been trained on French data. We can search through
+the `model hub <https://huggingface.co/models>`__ that gathers models pretrained on a lot of data by research labs, but
+also community models (usually fine-tuned versions of those big models on a specific dataset). Applying the tags
+"French" and "text-classification" gives back a suggestion "nlptown/bert-base-multilingual-uncased-sentiment". Let's
+see how we can use it.
+
+You can directly pass the name of the model to use to :func:`~transformers.pipeline`:
+
+.. code-block::
+
+    >>> classifier = pipeline('sentiment-analysis', model="nlptown/bert-base-multilingual-uncased-sentiment")
+
+This classifier can now deal with texts in English, French, but also Dutch, German, Italian and Spanish! You can also
+replace that name by a local folder where you have saved a pretrained model (see below). You can also pass a model
+object and its associated tokenizer.
+
+We will need two classes for this. The first is :class:`~transformers.AutoTokenizer`, which we will use to download the
+tokenizer associated to the model we picked and instantiate it. The second is
+:class:`~transformers.AutoModelForSequenceClassification` (or
+:class:`~transformers.TFAutoModelForSequenceClassification` if you are using TensorFlow), which we will use to download
+the model itself. Note that if we were using the library on another task, the class of the model would change. The
+:doc:`task summary </task_summary>` tutorial summarizes which class is used for which task.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+
+Now, to download the models and tokenizer we found previously, we just have to use the
+:func:`~transformers.AutoModelForSequenceClassification.from_pretrained` method (feel free to replace ``model_name`` by
+any other model from the model hub):
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+    >>> model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+    >>> ## TENSORFLOW CODE
+    >>> model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
+    >>> # This model only exists in PyTorch, so we use the `from_pt` flag to import that model in TensorFlow.
+    >>> model = TFAutoModelForSequenceClassification.from_pretrained(model_name, from_pt=True)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> classifier = pipeline('sentiment-analysis', model=model, tokenizer=tokenizer)
+
+If you don't find a model that has been pretrained on some data similar to yours, you will need to fine-tune a
+pretrained model on your data. We provide :doc:`example scripts </examples>` to do so. Once you're done, don't forget
+to share your fine-tuned model on the hub with the community, using :doc:`this tutorial </model_sharing>`.
+
+.. _pretrained-model:
+
+Under the hood: pretrained models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Let's now see what happens beneath the hood when using those pipelines. As we saw, the model and tokenizer are created
+using the :obj:`from_pretrained` method:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> pt_model = AutoModelForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> tf_model = TFAutoModelForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+Using the tokenizer
+^^^^^^^^^^^^^^^^^^^
+
+We mentioned the tokenizer is responsible for the preprocessing of your texts. First, it will split a given text into
+words (or part of words, punctuation symbols, etc.) usually called `tokens`. There are multiple rules that can govern
+that process (you can learn more about them in the :doc:`tokenizer summary <tokenizer_summary>`), which is why we need
+to instantiate the tokenizer using the name of the model, to make sure we use the same rules as when the model was
+pretrained.
+
+The second step is to convert those `tokens` into numbers, to be able to build a tensor out of them and feed them to
+the model. To do this, the tokenizer has a `vocab`, which is the part we download when we instantiate it with the
+:obj:`from_pretrained` method, since we need to use the same `vocab` as when the model was pretrained.
+
+To apply these steps on a given text, we can just feed it to our tokenizer:
+
+.. code-block::
+
+    >>> inputs = tokenizer("We are very happy to show you the 🤗 Transformers library.")
+
+This returns a dictionary mapping strings to lists of ints. It contains the `ids of the tokens <glossary.html#input-ids>`__,
+as mentioned before, but also additional arguments that will be useful to the model. Here for instance, we also have an
+`attention mask <glossary.html#attention-mask>`__ that the model will use to have a better understanding of the sequence:
+
+
+.. code-block::
+
+    >>> print(inputs)
+    {'input_ids': [101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
+
+You can pass a list of sentences directly to your tokenizer. If your goal is to send them through your model as a
+batch, you probably want to pad them all to the same length, truncate them to the maximum length the model can accept
+and get tensors back. You can specify all of that to the tokenizer:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> pt_batch = tokenizer(
+    ...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
+    ...     padding=True,
+    ...     truncation=True,
+    ...     return_tensors="pt"
+    ... )
+    >>> ## TENSORFLOW CODE
+    >>> tf_batch = tokenizer(
+    ...     ["We are very happy to show you the 🤗 Transformers library.", "We hope you don't hate it."],
+    ...     padding=True,
+    ...     truncation=True,
+    ...     return_tensors="tf"
+    ... )
+
+The padding is automatically applied on the side expected by the model (in this case, on the right), with the
+padding token the model was pretrained with. The attention mask is also adapted to take the padding into account:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> for key, value in pt_batch.items():
+    ...     print(f"{key}: {value.numpy().tolist()}")
+    input_ids: [[101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], [101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0]]
+    attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]
+    >>> ## TENSORFLOW CODE
+    >>> for key, value in tf_batch.items():
+    ...     print(f"{key}: {value.numpy().tolist()}")
+    input_ids: [[101, 2057, 2024, 2200, 3407, 2000, 2265, 2017, 1996, 100, 19081, 3075, 1012, 102], [101, 2057, 3246, 2017, 2123, 1005, 1056, 5223, 2009, 1012, 102, 0, 0, 0]]
+    attention_mask: [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]]
+
+You can learn more about tokenizers :doc:`here <preprocessing>`.
+
+Using the model
+^^^^^^^^^^^^^^^
+
+Once your input has been preprocessed by the tokenizer, you can send it directly to the model. As we mentioned, it will
+contain all the relevant information the model needs. If you're using a TensorFlow model, you can pass the
+dictionary of tensors directly to the model; for a PyTorch model, you need to unpack the dictionary by adding :obj:`**`.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> pt_outputs = pt_model(**pt_batch)
+    >>> ## TENSORFLOW CODE
+    >>> tf_outputs = tf_model(tf_batch)
+
+In 🤗 Transformers, all outputs are tuples (with only one element potentially). Here, we get a tuple with just the
+final activations of the model.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> print(pt_outputs)
+    (tensor([[-4.0833,  4.3364],
+            [ 0.0818, -0.0418]], grad_fn=<AddmmBackward>),)
+    >>> ## TENSORFLOW CODE
+    >>> print(tf_outputs)
+    (<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
+    array([[-4.0832963 ,  4.336414  ],
+           [ 0.08181786, -0.04179301]], dtype=float32)>,)
+
+The model can return more than just the final activations, which is why the output is a tuple. Here we only asked for
+the final activations, so we get a tuple with one element.
+
+.. note::
+
+    All 🤗 Transformers models (PyTorch or TensorFlow) return the activations of the model *before* the final
+    activation function (like SoftMax) since this final activation function is often fused with the loss.
+
+Let's apply the SoftMax activation to get predictions.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> import torch.nn.functional as F
+    >>> pt_predictions = F.softmax(pt_outputs[0], dim=-1)
+    >>> ## TENSORFLOW CODE
+    >>> import tensorflow as tf
+    >>> tf_predictions = tf.nn.softmax(tf_outputs[0], axis=-1)
+
+We can see we get the numbers from before:
+
+.. code-block::
+
+    >>> ## TENSORFLOW CODE
+    >>> print(tf_predictions)
+    tf.Tensor(
+    [[2.2042994e-04 9.9977952e-01]
+     [5.3086340e-01 4.6913657e-01]], shape=(2, 2), dtype=float32)
+    >>> ## PYTORCH CODE
+    >>> print(pt_predictions)
+    tensor([[2.2043e-04, 9.9978e-01],
+            [5.3086e-01, 4.6914e-01]], grad_fn=<SoftmaxBackward>)
+
+If you have labels, you can provide them to the model, and it will return a tuple with the loss and the final activations.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> import torch
+    >>> pt_outputs = pt_model(**pt_batch, labels = torch.tensor([1, 0]))
+    >>> ## TENSORFLOW CODE
+    >>> import tensorflow as tf
+    >>> tf_outputs = tf_model(tf_batch, labels = tf.constant([1, 0]))
+
+Models are standard `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__ or
+`tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ so you can use them in your usual
+training loop. 🤗 Transformers also provides a :class:`~transformers.Trainer` (or :class:`~transformers.TFTrainer` if
+you are using TensorFlow) class to help with your training (taking care of things such as distributed training, mixed
+precision, etc.). See the :doc:`training tutorial <training>` for more details.
+
+.. note::
+
+    PyTorch model outputs are special dataclasses so that you can get autocompletion for their attributes in an IDE.
+    They also behave like a tuple or a dictionary (e.g., you can index with an integer, a slice or a string) in which
+    case the attributes not set (that have :obj:`None` values) are ignored.
+
+Once your model is fine-tuned, you can save it with its tokenizer in the following way:
+
+.. code-block::
+
+    tokenizer.save_pretrained(save_directory)
+    model.save_pretrained(save_directory)
+
+You can then load this model back using the :func:`~transformers.AutoModel.from_pretrained` method by passing the
+directory name instead of the model name. One cool feature of 🤗 Transformers is that you can easily switch between
+PyTorch and TensorFlow: any model saved as before can be loaded back either in PyTorch or TensorFlow. If you are
+loading a saved PyTorch model in a TensorFlow model, use :func:`~transformers.TFAutoModel.from_pretrained` like this:
+
+.. code-block::
+
+    tokenizer = AutoTokenizer.from_pretrained(save_directory)
+    model = TFAutoModel.from_pretrained(save_directory, from_pt=True)
+
+and if you are loading a saved TensorFlow model in a PyTorch model, you should use the following code:
+
+.. code-block::
+
+    tokenizer = AutoTokenizer.from_pretrained(save_directory)
+    model = AutoModel.from_pretrained(save_directory, from_tf=True)
+
+Lastly, you can also ask the model to return all hidden states and all attention weights if you need them:
+
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> pt_outputs = pt_model(**pt_batch, output_hidden_states=True, output_attentions=True)
+    >>> all_hidden_states, all_attentions = pt_outputs[-2:]
+    >>> ## TENSORFLOW CODE
+    >>> tf_outputs = tf_model(tf_batch, output_hidden_states=True, output_attentions=True)
+    >>> all_hidden_states, all_attentions = tf_outputs[-2:]
+
+Accessing the code
+^^^^^^^^^^^^^^^^^^
+
+The :obj:`AutoModel` and :obj:`AutoTokenizer` classes are just shortcuts that will automatically work with any
+pretrained model. Behind the scenes, the library has one model class per combination of architecture plus class, so the
+code is easy to access and tweak if you need to.
+
+In our previous example, the model was called "distilbert-base-uncased-finetuned-sst-2-english", which means it's
+using the :doc:`DistilBERT </model_doc/distilbert>` architecture. As
+:class:`~transformers.AutoModelForSequenceClassification` (or :class:`~transformers.TFAutoModelForSequenceClassification`
+if you are using TensorFlow) was used, the model automatically created is then a
+:class:`~transformers.DistilBertForSequenceClassification`. You can look at its documentation for all details relevant
+to that specific model, or browse the source code. This is how you would directly instantiate model and tokenizer
+without the auto magic:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> model = DistilBertForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased-finetuned-sst-2-english"
+    >>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+
+Customizing the model
+^^^^^^^^^^^^^^^^^^^^^
+
+If you want to change how the model itself is built, you can define your custom configuration class. Each architecture
+comes with its own relevant configuration (in the case of DistilBERT, :class:`~transformers.DistilBertConfig`) which
+allows you to specify any of the hidden dimension, dropout rate, etc. If you do core modifications, like changing the
+hidden size, you won't be able to use a pretrained model anymore and will need to train from scratch. You would then
+instantiate the model directly from this configuration.
+
+Here we use the predefined vocabulary of DistilBERT (hence load the tokenizer with the
+:func:`~transformers.DistilBertTokenizer.from_pretrained` method) and initialize the model from scratch (hence
+instantiate the model from the configuration instead of using the
+:func:`~transformers.DistilBertForSequenceClassification.from_pretrained` method).
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
+    >>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+    >>> model = DistilBertForSequenceClassification(config)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
+    >>> config = DistilBertConfig(n_heads=8, dim=512, hidden_dim=4*512)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
+    >>> model = TFDistilBertForSequenceClassification(config)
+
+For something that only changes the head of the model (for instance, the number of labels), you can still use a
+pretrained model for the body. For instance, let's define a classifier for 10 different labels using a pretrained body.
+We could create a configuration with all the default values and just change the number of labels, but more easily, you
+can directly pass any argument a configuration would take to the :func:`from_pretrained` method and it will update the
+default configuration with it:
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, DistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased"
+    >>> model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import DistilBertConfig, DistilBertTokenizer, TFDistilBertForSequenceClassification
+    >>> model_name = "distilbert-base-uncased"
+    >>> model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=10)
+    >>> tokenizer = DistilBertTokenizer.from_pretrained(model_name)
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -1,190 +1,251 @@
-Loading Google AI or OpenAI pre-trained weights or PyTorch dump
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+**********************************************
+Exporting transformers models
+**********************************************

-``from_pretrained()`` method
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+ONNX / ONNXRuntime
+==============================================

-To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of ``BertForPreTraining`` saved with ``torch.save()``\ ), the PyTorch model classes and the tokenizer can be instantiated using the ``from_pretrained()`` method:
+Projects `ONNX (Open Neural Network eXchange) <http://onnx.ai>`_ and `ONNXRuntime (ORT) <https://microsoft.github.io/onnxruntime/>`_ are part of an effort from leading industries in the AI field
+to provide a unified and community-driven format to store and, by extension, efficiently execute neural networks leveraging a variety
+of hardware and dedicated optimizations.
+
+Starting from transformers v2.10.0 we partnered with ONNX Runtime to provide an easy export of transformers models to
+the ONNX format. You can have a look at the effort by looking at our joint blog post `Accelerate your NLP pipelines using
+Hugging Face Transformers and ONNX Runtime <https://medium.com/microsoftazure/accelerate-your-nlp-pipelines-using-hugging-face-transformers-and-onnx-runtime-2443578f4333>`_.
+
+Exporting a model is done through the script ``convert_graph_to_onnx.py`` at the root of the transformers sources.
+The following command shows how easy it is to export a BERT model from the library; simply run:
+
+.. code-block:: bash
+
+    python convert_graph_to_onnx.py --framework <pt, tf> --model bert-base-cased bert-base-cased.onnx
+
+The conversion tool works for both PyTorch and Tensorflow models and ensures:
+
+* The model and its weights are correctly initialized from the Hugging Face model hub or a local checkpoint.
+* The inputs and outputs are correctly generated to their ONNX counterpart.
+* The generated model can be correctly loaded through onnxruntime.
+
+.. note::
+    Currently, inputs and outputs are always exported with dynamic sequence axes preventing some optimizations
+    on the ONNX Runtime. If you would like to see such support for fixed-length inputs/outputs, please
+    open up an issue on transformers.
+
+
+Also, the conversion tool supports different options which let you tune the behavior of the generated model:
+
+* **Change the target opset version of the generated model.**  (More recent opset generally supports more operators and enables faster inference)
+
+* **Export pipeline-specific prediction heads.**  (Allows you to export the model along with its task-specific prediction head(s))
+
+* **Use the external data format (PyTorch only).**  (Lets you export a model whose size is above 2GB (`More info <https://github.com/pytorch/pytorch/pull/33062>`_))
+
+
+Optimizations
+------------------------------------------------
+
+ONNXRuntime includes some transformers-specific transformations to leverage optimized operations in the graph.
+Below are some of the operators which can be enabled to speed up inference through ONNXRuntime (*see note below*):
+
+* Constant folding
+* Attention Layer fusing
+* Skip connection LayerNormalization fusing
+* FastGeLU approximation
+
+Some of the optimizations performed by ONNX runtime can be hardware specific and thus lead to different performances
+if used on another machine with a different hardware configuration than the one used for exporting the model.
+For this reason, when using ``convert_graph_to_onnx.py`` optimizations are not enabled,
+ensuring the model can be easily exported to various hardware.
+Optimizations can then be enabled when loading the model through ONNX runtime for inference.
+
+
+.. note::
+    When quantization is enabled (see below), ``convert_graph_to_onnx.py`` script will enable optimizations on the model
+    because quantization would modify the underlying graph making it impossible for ONNX runtime to do the optimizations
+    afterwards.
+
+.. note::
+    For more information about the optimizations enabled by ONNXRuntime, please have a look at the `ONNXRuntime Github <https://github.com/microsoft/onnxruntime/tree/master/onnxruntime/python/tools/transformers>`_.
+
+Quantization
+------------------------------------------------
+
+ONNX exporter supports generating a quantized version of the model to allow efficient inference.
+
+Quantization works by converting the memory representation of the parameters in the neural network
+to a compact integer format. By default, weights of a neural network are stored as single-precision float (`float32`)
+which can express a wide-range of floating-point numbers with decent precision.
+These properties are especially interesting at training where you want fine-grained representation.
+
+On the other hand, after the training phase, it has been shown one can greatly reduce the range and the precision of `float32` numbers
+without changing the performances of the neural network.
+
+More technically, `float32` parameters are converted to a type requiring fewer bits to represent each number, thus reducing
+the overall size of the model. Here, we are enabling `float32` mapping to `int8` values (a non-floating, single byte, number representation)
+according to the following formula:
+
+.. math::
+    y_{float32} = scale * (x_{int8} - zero\_point)
+
+.. note::
+    The quantization process will infer the parameters `scale` and `zero_point` from the neural network parameters.
+
+Leveraging tiny-integers has numerous advantages when it comes to inference:
+
+* Storing fewer bits instead of 32 bits for the `float32` reduces the size of the model and makes it load faster.
+* Integer operations execute an order of magnitude faster on modern hardware.
+* Integer operations require less power to do the computations.
+
+In order to convert a transformers model to ONNX IR with quantized weights you just need to specify ``--quantize``
+when using ``convert_graph_to_onnx.py``. Also, you can have a look at the ``quantize()`` utility-method in this
+same script file.
+
+Example of quantized BERT model export:
+
+.. code-block:: bash
+
+    python convert_graph_to_onnx.py --framework <pt, tf> --model bert-base-cased --quantize bert-base-cased.onnx
+
+.. note::
+    Quantization support requires ONNX Runtime >= 1.4.0
+
+.. note::
+    When exporting a quantized model, you will end up with two different ONNX files. The one specified at the end of the
+    above command will contain the original ONNX model storing `float32` weights.
+    The second one, with ``-quantized`` suffix, will hold the quantized parameters.
+
+
+TorchScript
+=======================================
+
+.. note::
+    This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities
+    with variable-input-size models. It is a focus of interest to us and we will deepen our analysis in upcoming
+    releases, with more code examples, a more flexible implementation, and benchmarks comparing python-based codes
+    with compiled TorchScript.
+
+
+According to PyTorch's documentation: "TorchScript is a way to create serializable and optimizable models from PyTorch code".
+PyTorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
+their model to be re-used in other programs, such as efficiency-oriented C++ programs.
+
+We have provided an interface that allows the export of 🤗 Transformers models to TorchScript so that they can
+be reused in a different environment than a Pytorch-based python program. Here we explain how to export and use our models using TorchScript.
+
+Exporting a model requires two things:
+
+* a forward pass with dummy inputs.
+* model instantiation with the ``torchscript`` flag.
+
+These necessities imply several things developers should be careful about. These are detailed below.
+
+
+Implications
+------------------------------------------------
+
+TorchScript flag and tied weights
+------------------------------------------------
+This flag is necessary because most of the language models in this repository have tied weights between their
+``Embedding`` layer and their ``Decoding`` layer. TorchScript does not allow the export of models that have tied weights, therefore
+it is necessary to untie and clone the weights beforehand.
+
+This implies that models instantiated with the ``torchscript`` flag have their ``Embedding`` layer and ``Decoding`` layer
+separate, which means that they should not be trained down the line. Training would de-synchronize the two layers,
+leading to unexpected results.
+
+This is not the case for models that do not have a Language Model head, as those do not have tied weights. These models
+can be safely exported without the ``torchscript`` flag.
+
+Dummy inputs and standard lengths
+------------------------------------------------
+
+The dummy inputs are used to do a model forward pass. While the inputs' values are propagating through the layers,
+Pytorch keeps track of the different operations executed on each tensor. These recorded operations are then used
+to create the "trace" of the model.
+
+The trace is created relative to the inputs' dimensions. It is therefore constrained by the dimensions of the dummy
+input, and will not work for any other sequence length or batch size. When trying with a different size, an error such
+as:
+
+``The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2``
+
+will be raised. It is therefore recommended to trace the model with a dummy input size at least as large as the largest
+input that will be fed to the model during inference. Padding can be performed to fill the missing values. As the model
+will have been traced with a large input size however, the dimensions of the different matrices will be large as well,
+resulting in more calculations.
+
+It is recommended to be careful of the total number of operations done on each input and to follow performance closely
+when exporting varying sequence-length models.
+
+Using TorchScript in Python
+-------------------------------------------------
+
+Below is an example showing how to save and load models, as well as how to use the trace for inference.
+
+Saving a model
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This snippet shows how to use TorchScript to export a ``BertModel``. Here the ``BertModel`` is instantiated
+according to a ``BertConfig`` class and then saved to disk under the filename ``traced_bert.pt``.

 .. code-block:: python

-   model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs)
+    from transformers import BertModel, BertTokenizer, BertConfig
+    import torch

-where
+    enc = BertTokenizer.from_pretrained("bert-base-uncased")

+    # Tokenizing input text
+    text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+    tokenized_text = enc.tokenize(text)

-* ``BERT_CLASS`` is either a tokenizer to load the vocabulary (\ ``BertTokenizer`` or ``OpenAIGPTTokenizer`` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): ``BertModel``\ , ``BertForMaskedLM``\ , ``BertForNextSentencePrediction``\ , ``BertForPreTraining``\ , ``BertForSequenceClassification``\ , ``BertForTokenClassification``\ , ``BertForMultipleChoice``\ , ``BertForQuestionAnswering``\ , ``OpenAIGPTModel``\ , ``OpenAIGPTLMHeadModel`` or ``OpenAIGPTDoubleHeadsModel``\ , and
-*
-  ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is either:
+    # Masking one of the input tokens
+    masked_index = 8
+    tokenized_text[masked_index] = '[MASK]'
+    indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
+    segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]

+    # Creating a dummy input
+    tokens_tensor = torch.tensor([indexed_tokens])
+    segments_tensors = torch.tensor([segments_ids])
+    dummy_input = [tokens_tensor, segments_tensors]

-  *
-    the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
+    # Initializing the model with the torchscript flag
+    # Flag set to True even though it is not necessary as this model does not have an LM Head.
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, torchscript=True)

+    # Instantiating the model
+    model = BertModel(config)

-    * ``bert-base-uncased``: 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-large-uncased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
-    * ``bert-base-cased``: 12-layer, 768-hidden, 12-heads , 110M parameters
-    * ``bert-large-cased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
-    * ``bert-base-multilingual-uncased``: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-multilingual-cased``: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-chinese``: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-german-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://deepset.ai/german-bert>`__
-    * ``bert-large-uncased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
-    * ``bert-large-cased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
-    * ``bert-large-uncased-whole-word-masking-finetuned-squad``: The ``bert-large-uncased-whole-word-masking`` model finetuned on SQuAD (using the ``run_bert_squad.py`` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
-    * ``bert-base-german-dbmdz-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://github.com/dbmdz/german-bert>`__
-    * ``bert-base-german-dbmdz-uncased``: Trained on (uncased) German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://github.com/dbmdz/german-bert>`__
-    * ``openai-gpt``: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``gpt2``: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
-    * ``gpt2-medium``: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
-    * ``transfo-xl-wt103``: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
+    # The model needs to be in evaluation mode
+    model.eval()

-  *
-    a path or url to a pretrained model archive containing:
+    # If you are instantiating the model with `from_pretrained` you can also easily set the TorchScript flag
+    model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)

+    # Creating the trace
+    traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
+    torch.jit.save(traced_model, "traced_bert.pt")

-    * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
-    * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
+Loading a model
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/transformers/blob/master/transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
-
-*
-  ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
-
-* ``from_tf``\ : should we load the weights from a locally saved TensorFlow checkpoint
-* ``state_dict``\ : an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
-* ``*inputs``\ , `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
-
-``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`__ or the original TensorFlow repository.
-
-When using an ``uncased model``\ , make sure your tokenizer has ``do_lower_case=True`` (either in its configuration, or passed as an additional parameter).
-
-Examples:
+This snippet shows how to load the ``BertModel`` that was previously saved to disk under the name ``traced_bert.pt``.
+We are re-using the previously initialised ``dummy_input``.

 .. code-block:: python

-   # BERT
-   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_basic_tokenize=True)
-   model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+    loaded_model = torch.jit.load("traced_bert.pt")
+    loaded_model.eval()

-   # OpenAI GPT
-   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-   model = OpenAIGPTModel.from_pretrained('openai-gpt')
+    all_encoder_layers, pooled_output = loaded_model(*dummy_input)

-   # Transformer-XL
-   tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-   model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
+Using a traced model for inference
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-   # OpenAI GPT-2
-   tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-   model = GPT2Model.from_pretrained('gpt2')
-
-Cache directory
-~~~~~~~~~~~~~~~
-
-``pytorch_pretrained_bert`` save the pretrained weights in a cache directory which is located at (in this order of priority):
-
-
-* ``cache_dir`` optional arguments to the ``from_pretrained()`` method (see above),
-* shell environment variable ``PYTORCH_PRETRAINED_BERT_CACHE``\ ,
-* PyTorch cache home + ``/pytorch_pretrained_bert/``
-  where PyTorch cache home is defined by (in this order):
-
-  * shell environment variable ``ENV_TORCH_HOME``
-  * shell environment variable ``ENV_XDG_CACHE_HOME`` + ``/torch/``\ )
-  * default: ``~/.cache/torch/``
-
-Usually, if you don't set any specific environment variable, ``pytorch_pretrained_bert`` cache will be at ``~/.cache/torch/pytorch_pretrained_bert/``.
-
-You can alsways safely delete ``pytorch_pretrained_bert`` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
-
-Serialization best-practices
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This section explain how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
-There are three types of files you need to save to be able to reload a fine-tuned model:
-
-
-* the model itself which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
-* the configuration file of the model which is saved as a JSON file, and
-* the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
-
-The *default filenames* of these files are as follow:
-
-
-* the model weights file: ``pytorch_model.bin``\ ,
-* the configuration file: ``config.json``\ ,
-* the vocabulary file: ``vocab.txt`` for BERT and Transformer-XL, ``vocab.json`` for GPT/GPT-2 (BPE vocabulary),
-* for GPT/GPT-2 (BPE vocabulary) the additional merges file: ``merges.txt``.
-
-**If you save a model using these *default filenames*\ , you can then re-load the model and tokenizer using the ``from_pretrained()`` method.**
-
-Here is the recommended way of saving the model, configuration and vocabulary to an ``output_dir`` directory and reloading the model and tokenizer afterwards:
+Using the traced model for inference is as simple as using its ``__call__`` dunder method:

 .. code-block:: python

-   from transformers import WEIGHTS_NAME, CONFIG_NAME
-
-   output_dir = "./models/"
-
-   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
-
-   # If we have a distributed model, save only the encapsulated model
-   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
-   model_to_save = model.module if hasattr(model, 'module') else model
-
-   # If we save using the predefined names, we can load using `from_pretrained`
-   output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
-   output_config_file = os.path.join(output_dir, CONFIG_NAME)
-
-   torch.save(model_to_save.state_dict(), output_model_file)
-   model_to_save.config.to_json_file(output_config_file)
-   tokenizer.save_pretrained(output_dir)
-
-   # Step 2: Re-load the saved model and vocabulary
-
-   # Example for a Bert model
-   model = BertForQuestionAnswering.from_pretrained(output_dir)
-   tokenizer = BertTokenizer.from_pretrained(output_dir)  # Add specific options if needed
-   # Example for a GPT model
-   model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
-   tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
-
-Here is another way you can save and reload the model if you want to use specific paths for each type of files:
-
-.. code-block:: python
-
-   output_model_file = "./models/my_own_model_file.bin"
-   output_config_file = "./models/my_own_config_file.bin"
-   output_vocab_file = "./models/my_own_vocab_file.bin"
-
-   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
-
-   # If we have a distributed model, save only the encapsulated model
-   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
-   model_to_save = model.module if hasattr(model, 'module') else model
-
-   torch.save(model_to_save.state_dict(), output_model_file)
-   model_to_save.config.to_json_file(output_config_file)
-   tokenizer.save_vocabulary(output_vocab_file)
-
-   # Step 2: Re-load the saved model and vocabulary
-
-   # We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
-   # Here is how to do it in this situation:
-
-   # Example for a Bert model
-   config = BertConfig.from_json_file(output_config_file)
-   model = BertForQuestionAnswering(config)
-   state_dict = torch.load(output_model_file)
-   model.load_state_dict(state_dict)
-   tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
-
-   # Example for a GPT model
-   config = OpenAIGPTConfig.from_json_file(output_config_file)
-   model = OpenAIGPTDoubleHeadsModel(config)
-   state_dict = torch.load(output_model_file)
-   model.load_state_dict(state_dict)
-   tokenizer = OpenAIGPTTokenizer(output_vocab_file)
-
+    traced_model(tokens_tensor, segments_tensors)
--- a/docs/source/task_summary.rst
+++ b/docs/source/task_summary.rst
@@ -0,0 +1,856 @@
+Summary of the tasks
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This page shows the most frequent use-cases when using the library. The models available allow for many different
+configurations and a great versatility in use-cases. The most simple ones are presented here, showcasing usage
+for tasks such as question answering, sequence classification, named entity recognition and others.
+
+These examples leverage auto-models, which are classes that will instantiate a model according to a given checkpoint,
+automatically selecting the correct model architecture. Please check the :class:`~transformers.AutoModel` documentation
+for more information.
+Feel free to modify the code to be more specific and adapt it to your specific use-case.
+
+In order for a model to perform well on a task, it must be loaded from a checkpoint corresponding to that task. These
+checkpoints are usually pre-trained on a large corpus of data and fine-tuned on a specific task. This means the
+following:
+
+- Not all models were fine-tuned on all tasks. If you want to fine-tune a model on a specific task, you can leverage
+  one of the `run_$TASK.py` scripts in the
+  `examples <https://github.com/huggingface/transformers/tree/master/examples>`__ directory.
+- Fine-tuned models were fine-tuned on a specific dataset. This dataset may or may not overlap with your use-case
+  and domain. As mentioned previously, you may leverage the
+  `examples <https://github.com/huggingface/transformers/tree/master/examples>`__ scripts to fine-tune your model, or you
+  may create your own training script.
+
+In order to do an inference on a task, several mechanisms are made available by the library:
+
+- Pipelines: very easy-to-use abstractions, which require as little as two lines of code.
+- Direct model use: Fewer abstractions, but more flexibility and power via a direct access to a tokenizer (PyTorch/TensorFlow) and full inference capacity.
+
+Both approaches are showcased here.
+
+.. note::
+
+    All tasks presented here leverage pre-trained checkpoints that were fine-tuned on specific tasks. Loading a
+    checkpoint that was not fine-tuned on a specific task would load only the base transformer layers and not the
+    additional head that is used for the task, initializing the weights of that head randomly.
+
+    This would produce random output.
+
+Sequence Classification
+--------------------------
+
+Sequence classification is the task of classifying sequences according to a given number of classes. An example
+of sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune
+a model on a GLUE sequence classification task, you may leverage the
+`run_glue.py <https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_glue.py>`__,
+`run_pl_glue.py <https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_pl_glue.py>`__ or
+`run_tf_glue.py <https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_tf_glue.py>`__ scripts.
+
+Here is an example of using pipelines to do sentiment analysis: identifying if a sequence is positive or negative.
+It leverages a fine-tuned model on sst2, which is a GLUE task.
+
+This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows:
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> nlp = pipeline("sentiment-analysis")
+
+    >>> result = nlp("I hate you")[0]
+    >>> print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
+    label: NEGATIVE, with score: 0.9991
+
+    >>> result = nlp("I love you")[0]
+    >>> print(f"label: {result['label']}, with score: {round(result['score'], 4)}")
+    label: POSITIVE, with score: 0.9999
+
+
+Here is an example of doing a sequence classification using a model to determine if two sequences are paraphrases
+of each other. The process is the following:
+
+1. Instantiate a tokenizer and a model from the checkpoint name. The model is
+   identified as a BERT model and is loaded with the weights stored in the
+   checkpoint.
+2. Build a sequence from the two sentences, with the correct model-specific
+   separators, token type ids and attention masks
+   (:func:`~transformers.PreTrainedTokenizer.encode` and
+   :func:`~transformers.PreTrainedTokenizer.__call__` take care of this).
+3. Pass this sequence through the model so that it is classified in one of the
+   two available classes: 0 (not a paraphrase) and 1 (is a paraphrase).
+4. Compute the softmax of the result to get probabilities over the classes.
+5. Print the results.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForSequenceClassification
+    >>> import torch
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+    >>> model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+    >>> classes = ["not paraphrase", "is paraphrase"]
+
+    >>> sequence_0 = "The company HuggingFace is based in New York City"
+    >>> sequence_1 = "Apples are especially bad for your health"
+    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+    >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="pt")
+    >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="pt")
+
+    >>> paraphrase_classification_logits = model(**paraphrase).logits
+    >>> not_paraphrase_classification_logits = model(**not_paraphrase).logits
+
+    >>> paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
+    >>> not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
+
+    >>> # Should be paraphrase
+    >>> for i in range(len(classes)):
+    ...     print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%")
+    not paraphrase: 10%
+    is paraphrase: 90%
+
+    >>> # Should not be paraphrase
+    >>> for i in range(len(classes)):
+    ...     print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%")
+    not paraphrase: 94%
+    is paraphrase: 6%
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
+    >>> import tensorflow as tf
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
+    >>> model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
+
+    >>> classes = ["not paraphrase", "is paraphrase"]
+
+    >>> sequence_0 = "The company HuggingFace is based in New York City"
+    >>> sequence_1 = "Apples are especially bad for your health"
+    >>> sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
+
+    >>> paraphrase = tokenizer(sequence_0, sequence_2, return_tensors="tf")
+    >>> not_paraphrase = tokenizer(sequence_0, sequence_1, return_tensors="tf")
+
+    >>> paraphrase_classification_logits = model(paraphrase)[0]
+    >>> not_paraphrase_classification_logits = model(not_paraphrase)[0]
+
+    >>> paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
+    >>> not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
+
+    >>> # Should be paraphrase
+    >>> for i in range(len(classes)):
+    ...     print(f"{classes[i]}: {int(round(paraphrase_results[i] * 100))}%")
+    not paraphrase: 10%
+    is paraphrase: 90%
+
+    >>> # Should not be paraphrase
+    >>> for i in range(len(classes)):
+    ...     print(f"{classes[i]}: {int(round(not_paraphrase_results[i] * 100))}%")
+    not paraphrase: 94%
+    is paraphrase: 6%
+
+Extractive Question Answering
+----------------------------------------------------
+
+Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+a model on a SQuAD task, you may leverage the
+`run_squad.py <https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_squad.py>`__ and
+`run_tf_squad.py <https://github.com/huggingface/transformers/tree/master/examples/question-answering/run_tf_squad.py>`__ scripts.
+
+
+Here is an example of using pipelines to do question answering: extracting an answer from a text given a question.
+It leverages a fine-tuned model on SQuAD.
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> nlp = pipeline("question-answering")
+
+    >>> context = r"""
+    ... Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
+    ... question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
+    ... a model on a SQuAD task, you may leverage the examples/question-answering/run_squad.py script.
+    ... """
+
+This returns an answer extracted from the text, a confidence score, alongside "start" and "end" values, which
+are the positions of the extracted answer in the text.
+
+.. code-block::
+
+    >>> result = nlp(question="What is extractive question answering?", context=context)
+    >>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+    Answer: 'the task of extracting an answer from a text given a question.', score: 0.6226, start: 34, end: 96
+
+    >>> result = nlp(question="What is a good example of a question answering dataset?", context=context)
+    >>> print(f"Answer: '{result['answer']}', score: {round(result['score'], 4)}, start: {result['start']}, end: {result['end']}")
+    Answer: 'SQuAD dataset,', score: 0.5053, start: 147, end: 161
+
+
+Here is an example of question answering using a model and a tokenizer. The process is the following:
+
+1. Instantiate a tokenizer and a model from the checkpoint name. The model is
+   identified as a BERT model and is loaded with the weights stored in the
+   checkpoint.
+2. Define a text and a few questions.
+3. Iterate over the questions and build a sequence from the text and the current
+   question, with the correct model-specific separators, token type ids and
+   attention masks.
+4. Pass this sequence through the model. This outputs a range of scores across
+   the entire sequence tokens (question and text), for both the start and end
+   positions.
+5. Compute the softmax of the result to get probabilities over the tokens.
+6. Fetch the tokens from the identified start and stop values, and convert those tokens to a string.
+7. Print the results.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoTokenizer, AutoModelForQuestionAnswering
+    >>> import torch
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+    >>> model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+
+    >>> text = r"""
+    ... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+    ... architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+    ... Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+    ... TensorFlow 2.0 and PyTorch.
+    ... """
+
+    >>> questions = [
+    ...     "How many pretrained models are available in 🤗 Transformers?",
+    ...     "What does 🤗 Transformers provide?",
+    ...     "🤗 Transformers provides interoperability between which frameworks?",
+    ... ]
+
+    >>> for question in questions:
+    ...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="pt")
+    ...     input_ids = inputs["input_ids"].tolist()[0]
+    ...
+    ...     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+    ...     answer_start_scores, answer_end_scores = model(**inputs)
+    ...
+    ...     answer_start = torch.argmax(
+    ...         answer_start_scores
+    ...     )  # Get the most likely beginning of answer with the argmax of the score
+    ...     answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
+    ...
+    ...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+    ...
+    ...     print(f"Question: {question}")
+    ...     print(f"Answer: {answer}")
+    Question: How many pretrained models are available in 🤗 Transformers?
+    Answer: over 32 +
+    Question: What does 🤗 Transformers provide?
+    Answer: general - purpose architectures
+    Question: 🤗 Transformers provides interoperability between which frameworks?
+    Answer: tensorflow 2 . 0 and pytorch
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
+    >>> import tensorflow as tf
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+    >>> model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
+
+    >>> text = r"""
+    ... 🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
+    ... architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
+    ... Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
+    ... TensorFlow 2.0 and PyTorch.
+    ... """
+
+    >>> questions = [
+    ...     "How many pretrained models are available in 🤗 Transformers?",
+    ...     "What does 🤗 Transformers provide?",
+    ...     "🤗 Transformers provides interoperability between which frameworks?",
+    ... ]
+
+    >>> for question in questions:
+    ...     inputs = tokenizer(question, text, add_special_tokens=True, return_tensors="tf")
+    ...     input_ids = inputs["input_ids"].numpy()[0]
+    ...
+    ...     text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+    ...     answer_start_scores, answer_end_scores = model(inputs)
+    ...
+    ...     answer_start = tf.argmax(
+    ...         answer_start_scores, axis=1
+    ...     ).numpy()[0]  # Get the most likely beginning of answer with the argmax of the score
+    ...     answer_end = (
+    ...         tf.argmax(answer_end_scores, axis=1) + 1
+    ...     ).numpy()[0]  # Get the most likely end of answer with the argmax of the score
+    ...     answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
+    ...
+    ...     print(f"Question: {question}")
+    ...     print(f"Answer: {answer}")
+    Question: How many pretrained models are available in 🤗 Transformers?
+    Answer: over 32 +
+    Question: What does 🤗 Transformers provide?
+    Answer: general - purpose architectures
+    Question: 🤗 Transformers provides interoperability between which frameworks?
+    Answer: tensorflow 2 . 0 and pytorch
+
+
+
+Language Modeling
+----------------------------------------------------
+
+Language modeling is the task of fitting a model to a corpus, which can be domain specific. All popular transformer-based
+models are trained using a variant of language modeling, e.g. BERT with masked language modeling, GPT-2 with
+causal language modeling.
+
+Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
+domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset
+or on scientific papers e.g. `LysandreJik/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.
+
+Masked Language Modeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Masked language modeling is the task of masking tokens in a sequence with a masking token, and prompting the model to
+fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
+right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis
+for downstream tasks requiring bi-directional context, such as SQuAD (question answering,
+see `Lewis, Liu, Goyal et al. <https://arxiv.org/abs/1910.13461>`__, part 4.2).
+
+Here is an example of using pipelines to replace a mask from a sequence:
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> nlp = pipeline("fill-mask")
+
+This outputs the sequences with the mask filled, the confidence score, and the token id in the tokenizer
+vocabulary:
+
+.. code-block::
+
+    >>> from pprint import pprint
+    >>> pprint(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."))
+    [{'score': 0.1792745739221573,
+      'sequence': '<s>HuggingFace is creating a tool that the community uses to '
+                  'solve NLP tasks.</s>',
+      'token': 3944,
+      'token_str': 'Ġtool'},
+     {'score': 0.11349421739578247,
+      'sequence': '<s>HuggingFace is creating a framework that the community uses '
+                  'to solve NLP tasks.</s>',
+      'token': 7208,
+      'token_str': 'Ġframework'},
+     {'score': 0.05243554711341858,
+      'sequence': '<s>HuggingFace is creating a library that the community uses to '
+                  'solve NLP tasks.</s>',
+      'token': 5560,
+      'token_str': 'Ġlibrary'},
+     {'score': 0.03493533283472061,
+      'sequence': '<s>HuggingFace is creating a database that the community uses '
+                  'to solve NLP tasks.</s>',
+      'token': 8503,
+      'token_str': 'Ġdatabase'},
+     {'score': 0.02860250137746334,
+      'sequence': '<s>HuggingFace is creating a prototype that the community uses '
+                  'to solve NLP tasks.</s>',
+      'token': 17715,
+      'token_str': 'Ġprototype'}]
+
+Here is an example of doing masked language modeling using a model and a tokenizer. The process is the following:
+
+1. Instantiate a tokenizer and a model from the checkpoint name. The model is
+   identified as a DistilBERT model and is loaded with the weights stored in the
+   checkpoint.
+2. Define a sequence with a masked token, placing the :obj:`tokenizer.mask_token` instead of a word.
+3. Encode that sequence into a list of IDs and find the position of the masked token in that list.
+4. Retrieve the predictions at the index of the mask token: this tensor has the
+   same size as the vocabulary, and the values are the scores attributed to each
+   token. The model gives higher score to tokens it deems probable in that
+   context.
+5. Retrieve the top 5 tokens using the PyTorch :obj:`topk` or TensorFlow :obj:`top_k` methods.
+6. Replace the mask token with each of these tokens and print the results.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+    >>> import torch
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    >>> model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+
+    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+
+    >>> input = tokenizer.encode(sequence, return_tensors="pt")
+    >>> mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
+
+    >>> token_logits = model(input).logits
+    >>> mask_token_logits = token_logits[0, mask_token_index, :]
+
+    >>> top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
+    >>> import tensorflow as tf
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
+    >>> model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
+
+    >>> sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
+
+    >>> input = tokenizer.encode(sequence, return_tensors="tf")
+    >>> mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
+
+    >>> token_logits = model(input)[0]
+    >>> mask_token_logits = token_logits[0, mask_token_index, :]
+
+    >>> top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
+
+
+This prints five sequences, with the top 5 tokens predicted by the model:
+
+.. code-block::
+
+    >>> for token in top_5_tokens:
+    ...     print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
+    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
+
+
+Causal Language Modeling
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
+model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
+for generation tasks.
+
+Usually, the next token is predicted by sampling from the logits of the last hidden state the model produces from the input sequence.
+
+Here is an example of using the tokenizer and model and leveraging the :func:`~transformers.top_k_top_p_filtering` function to sample the next token following an input sequence of tokens.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer, top_k_top_p_filtering
+    >>> import torch
+    >>> from torch.nn import functional as F
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    >>> model = AutoModelWithLMHead.from_pretrained("gpt2")
+
+    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "
+
+    >>> input_ids = tokenizer.encode(sequence, return_tensors="pt")
+
+    >>> # get logits of last hidden state
+    >>> next_token_logits = model(input_ids).logits[:, -1, :]
+
+    >>> # filter
+    >>> filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
+
+    >>> # sample
+    >>> probs = F.softmax(filtered_next_token_logits, dim=-1)
+    >>> next_token = torch.multinomial(probs, num_samples=1)
+
+    >>> generated = torch.cat([input_ids, next_token], dim=-1)
+
+    >>> resulting_string = tokenizer.decode(generated.tolist()[0])
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer, tf_top_k_top_p_filtering
+    >>> import tensorflow as tf
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("gpt2")
+    >>> model = TFAutoModelWithLMHead.from_pretrained("gpt2")
+
+    >>> sequence = f"Hugging Face is based in DUMBO, New York City, and "
+
+    >>> input_ids = tokenizer.encode(sequence, return_tensors="tf")
+
+    >>> # get logits of last hidden state
+    >>> next_token_logits = model(input_ids)[0][:, -1, :]
+
+    >>> # filter
+    >>> filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
+
+    >>> # sample
+    >>> next_token = tf.random.categorical(filtered_next_token_logits, dtype=tf.int32, num_samples=1)
+
+    >>> generated = tf.concat([input_ids, next_token], axis=1)
+
+    >>> resulting_string = tokenizer.decode(generated.numpy().tolist()[0])
+
+
+This outputs a (hopefully) coherent next token following the original sequence, which in our case is the word *has*:
+
+.. code-block::
+
+    >>> print(resulting_string)
+    Hugging Face is based in DUMBO, New York City, and has
+
+In the next section, we show how this functionality is leveraged in :func:`~transformers.PreTrainedModel.generate` to generate multiple tokens up to a user-defined length.
+
+Text Generation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In text generation (*a.k.a* *open-ended text generation*) the goal is to create a coherent portion of text that is a continuation from the given context. The following example shows how *GPT-2* can be used in pipelines to generate text. As a default all models apply *Top-K* sampling when used in pipelines, as configured in their respective configurations (see `gpt-2 config <https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json>`__ for example).
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> text_generator = pipeline("text-generation")
+    >>> print(text_generator("As far as I am concerned, I will", max_length=50, do_sample=False))
+    [{'generated_text': 'As far as I am concerned, I will be the first to admit that I am not a fan of the idea of a "free market." I think that the idea of a free market is a bit of a stretch. I think that the idea'}]
+
+
+
+Here, the model generates a random text with a total maximal length of *50* tokens from context *"As far as I am concerned, I will"*.
+The default arguments of ``PreTrainedModel.generate()`` can be directly overridden in the pipeline, as is shown above for the argument ``max_length``.
+
+Here is an example of text generation using ``XLNet`` and its tokenizer.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    >>> model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
+
+    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    >>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    ... (except for Alexei and Maria) are discovered.
+    ... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    ... remainder of the story. 1883 Western Siberia,
+    ... a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    ... Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    ... father initially slaps him for making such an accusation, Rasputin watches as the
+    ... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    ... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
+
+    >>> prompt = "Today the weather is really nice and I am planning on "
+    >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
+
+    >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
+
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
+
+    >>> model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
+    >>> tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
+
+    >>> # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
+    >>> PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
+    ... (except for Alexei and Maria) are discovered.
+    ... The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+    ... remainder of the story. 1883 Western Siberia,
+    ... a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+    ... Rasputin has a vision and denounces one of the men as a horse thief. Although his
+    ... father initially slaps him for making such an accusation, Rasputin watches as the
+    ... man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+    ... the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+    ... with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
+
+    >>> prompt = "Today the weather is really nice and I am planning on "
+    >>> inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
+
+    >>> prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
+    >>> outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
+    >>> generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
+
+.. code-block::
+
+    >>> print(generated)
+    Today the weather is really nice and I am planning on anning on taking a nice...... of a great time!<eop>...............
+
+Text generation is currently possible with *GPT-2*, *OpenAi-GPT*, *CTRL*, *XLNet*, *Transfo-XL* and *Reformer* in PyTorch and for most models in TensorFlow as well. As can be seen in the example above, *XLNet* and *Transfo-XL* often need to be padded to work well.
+GPT-2 is usually a good choice for *open-ended text generation* because it was trained on millions of webpages with a causal language modeling objective.
+
+For more information on how to apply different decoding strategies for text generation, please also refer to our text generation blog post `here <https://huggingface.co/blog/how-to-generate>`__.
+
+
+Named Entity Recognition
+----------------------------------------------------
+
+Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example, identifying a
+token as a person, an organisation or a location.
+An example of a named entity recognition dataset is the CoNLL-2003 dataset, which is entirely based on that task.
+If you would like to fine-tune a model on an NER task, you may leverage the
+`run_ner.py <https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_ner.py>`__ (PyTorch),
+`run_pl_ner.py <https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_pl_ner.py>`__ (leveraging pytorch-lightning) or the
+`run_tf_ner.py <https://github.com/huggingface/transformers/tree/master/examples/token-classification/run_tf_ner.py>`__ (TensorFlow) scripts.
+
+Here is an example of using pipelines to do named entity recognition, specifically, trying to identify tokens as belonging to one
+of 9 classes:
+
+- O, Outside of a named entity
+- B-MISC, Beginning of a miscellaneous entity right after another miscellaneous entity
+- I-MISC, Miscellaneous entity
+- B-PER, Beginning of a person's name right after another person's name
+- I-PER, Person's name
+- B-ORG, Beginning of an organisation right after another organisation
+- I-ORG, Organisation
+- B-LOC, Beginning of a location right after another location
+- I-LOC, Location
+
+It leverages a fine-tuned model on CoNLL-2003, fine-tuned by `@stefan-it <https://github.com/stefan-it>`__ from
+`dbmdz <https://github.com/dbmdz>`__.
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> nlp = pipeline("ner")
+
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+    ...            "close to the Manhattan Bridge which is visible from the window."
+
+
+This outputs a list of all words that have been identified as one of the entities from the 9 classes defined above. Here are the
+expected results:
+
+.. code-block::
+
+    >>> print(nlp(sequence))
+    [
+        {'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
+        {'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
+        {'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
+        {'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
+        {'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
+        {'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
+        {'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
+        {'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
+        {'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
+        {'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
+        {'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
+        {'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
+    ]
+
+Note how the tokens of the sequence "Hugging Face" have been identified as an organisation, and "New York City", "DUMBO" and
+"Manhattan Bridge" have been identified as locations.
+
+Here is an example of doing named entity recognition, using a model and a tokenizer. The process is the following:
+
+1. Instantiate a tokenizer and a model from the checkpoint name. The model is
+   identified as a BERT model and is loaded with the weights stored in the
+   checkpoint.
+2. Define the label list with which the model was trained.
+3. Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location.
+4. Split words into tokens so that they can be mapped to predictions. We use a
+   small hack by, first, completely encoding and decoding the sequence, so that
+   we're left with a string that contains the special tokens.
+5. Encode that sequence into IDs (special tokens are added automatically).
+6. Retrieve the predictions by passing the input to the model and getting the
+   first output. This results in a distribution over the 9 possible classes for
+   each token. We take the argmax to retrieve the most likely class for each
+   token.
+7. Zip together each token with its prediction and print it.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelForTokenClassification, AutoTokenizer
+    >>> import torch
+
+    >>> model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+    >>> label_list = [
+    ...     "O",       # Outside of a named entity
+    ...     "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
+    ...     "I-MISC",  # Miscellaneous entity
+    ...     "B-PER",   # Beginning of a person's name right after another person's name
+    ...     "I-PER",   # Person's name
+    ...     "B-ORG",   # Beginning of an organisation right after another organisation
+    ...     "I-ORG",   # Organisation
+    ...     "B-LOC",   # Beginning of a location right after another location
+    ...     "I-LOC"    # Location
+    ... ]
+
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+    ...            "close to the Manhattan Bridge."
+
+    >>> # Bit of a hack to get the tokens with the special tokens
+    >>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+    >>> inputs = tokenizer.encode(sequence, return_tensors="pt")
+
+    >>> outputs = model(inputs).logits
+    >>> predictions = torch.argmax(outputs, dim=2)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelForTokenClassification, AutoTokenizer
+    >>> import tensorflow as tf
+
+    >>> model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
+    >>> tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+
+    >>> label_list = [
+    ...     "O",       # Outside of a named entity
+    ...     "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
+    ...     "I-MISC",  # Miscellaneous entity
+    ...     "B-PER",   # Beginning of a person's name right after another person's name
+    ...     "I-PER",   # Person's name
+    ...     "B-ORG",   # Beginning of an organisation right after another organisation
+    ...     "I-ORG",   # Organisation
+    ...     "B-LOC",   # Beginning of a location right after another location
+    ...     "I-LOC"    # Location
+    ... ]
+
+    >>> sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
+    ...            "close to the Manhattan Bridge."
+
+    >>> # Bit of a hack to get the tokens with the special tokens
+    >>> tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
+    >>> inputs = tokenizer.encode(sequence, return_tensors="tf")
+
+    >>> outputs = model(inputs)[0]
+    >>> predictions = tf.argmax(outputs, axis=2)
+
+
+This outputs a list of each token mapped to its corresponding prediction. Differently from the pipeline, here every token has
+a prediction as we didn't remove the "0"th class, which means that no particular entity was found on that token. The
+following array should be the output:
+
+.. code-block::
+
+    >>> print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
+    [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]
+
+Summarization
+----------------------------------------------------
+
+Summarization is the task of summarizing a document or an article into a shorter text.
+
+An example of a summarization dataset is the CNN / Daily Mail dataset, which consists of long news articles and was created for the task of summarization.
+If you would like to fine-tune a model on a summarization task, various approaches are described in this
+`document <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/README.md>`__.
+
+Here is an example of using the pipelines to do summarization. It leverages a Bart model that was fine-tuned on the CNN / Daily Mail data set.
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> summarizer = pipeline("summarization")
+
+    >>> ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York.
+    ... A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband.
+    ... Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other.
+    ... In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage.
+    ... Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the
+    ... 2010 marriage license application, according to court documents.
+    ... Prosecutors said the marriages were part of an immigration scam.
+    ... On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further.
+    ... After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective
+    ... Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002.
+    ... All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say.
+    ... Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages.
+    ... Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted.
+    ... The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s
+    ... Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali.
+    ... Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force.
+    ... If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.
+    ... """
+
+Because the summarization pipeline depends on the ``PretrainedModel.generate()`` method, we can override the default arguments
+of ``PretrainedModel.generate()`` directly in the pipeline for ``max_length`` and ``min_length`` as shown below.
+This outputs the following summary:
+
+.. code-block::
+
+    >>> print(summarizer(ARTICLE, max_length=130, min_length=30, do_sample=False))
+    [{'summary_text': 'Liana Barrientos, 39, is charged with two counts of "offering a false instrument for filing in the first degree" In total, she has been married 10 times, with nine of her marriages occurring between 1999 and 2002. She is believed to still be married to four men.'}]
+
+Here is an example of doing summarization using a model and a tokenizer. The process is the following:
+
+1. Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``.
+2. Define the article that should be summarized.
+3. Add the T5 specific prefix "summarize: ".
+4. Use the ``PretrainedModel.generate()`` method to generate the summary.
+
+In this example we use Google's T5 model. Even though it was pre-trained only on a multi-task mixed dataset (including CNN / Daily Mail), it yields very good results.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    >>> model = AutoModelWithLMHead.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
+    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512)
+    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
+
+    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    >>> # T5 uses a max_length of 512 so we cut the article to 512 tokens.
+    >>> inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
+    >>> outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
+
+Translation
+----------------------------------------------------
+
+Translation is the task of translating a text from one language to another.
+
+An example of a translation dataset is the WMT English to German dataset, which has sentences in English as the input data
+and the corresponding sentences in German as the target data.
+If you would like to fine-tune a model on a translation task, various approaches are described in this
+`document <https://github.com/huggingface/transformers/blob/master/examples/seq2seq/README.md>`__.
+
+Here is an example of using the pipelines to do translation.
+It leverages a T5 model that was only pre-trained on a multi-task mixture dataset (including WMT), yet it yields impressive
+translation results.
+
+.. code-block::
+
+    >>> from transformers import pipeline
+
+    >>> translator = pipeline("translation_en_to_de")
+    >>> print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40))
+    [{'translation_text': 'Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.'}]
+
+Because the translation pipeline depends on the ``PretrainedModel.generate()`` method, we can override the default arguments
+of ``PretrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` above.
+
+Here is an example of doing translation using a model and a tokenizer. The process is the following:
+
+1. Instantiate a tokenizer and a model from the checkpoint name. Translation is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``.
+2. Define the text that should be translated.
+3. Add the T5 specific prefix "translate English to German: "
+4. Use the ``PretrainedModel.generate()`` method to perform the translation.
+
+.. code-block::
+
+    >>> ## PYTORCH CODE
+    >>> from transformers import AutoModelWithLMHead, AutoTokenizer
+
+    >>> model = AutoModelWithLMHead.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt")
+    >>> outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
+    >>> ## TENSORFLOW CODE
+    >>> from transformers import TFAutoModelWithLMHead, AutoTokenizer
+
+    >>> model = TFAutoModelWithLMHead.from_pretrained("t5-base")
+    >>> tokenizer = AutoTokenizer.from_pretrained("t5-base")
+
+    >>> inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf")
+    >>> outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
+
+As with the pipeline example, we get the same translation:
+
+.. code-block::
+
+    >>> print(tokenizer.decode(outputs[0]))
+    Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.
--- a/docs/source/testing.rst
+++ b/docs/source/testing.rst
@@ -0,0 +1,943 @@
+Testing
+==========
+
+
+Let's take a look at how 🤗 Transformer models are tested and how you can write new tests and improve the existing ones.
+
+There are 2 test suites in the repository:
+
+1. ``tests`` -- tests for the general API
+2. ``examples`` -- tests primarily for various applications that aren't part of the API
+
+How transformers are tested
+---------------------------
+
+1. Once a PR is submitted it gets tested with 9 CircleCi jobs. Every new commit to that PR gets retested. These jobs are defined in this `config file <https://github.com/huggingface/transformers/blob/master/.circleci/config.yml>`__, so that if needed you can reproduce the same environment on your machine.
+   
+   These CI jobs don't run ``@slow`` tests.
+   
+2. There are 3 jobs run by `github actions <https://github.com/huggingface/transformers/actions>`__:
+
+   * `torch hub integration <https://github.com/huggingface/transformers/blob/master/.github/workflows/github-torch-hub.yml>`__:  checks whether torch hub integration works.
+
+   * `self-hosted (push) <https://github.com/huggingface/transformers/blob/master/.github/workflows/self-push.yml>`__: runs fast tests on GPU only on commits on ``master``. It only runs if a commit on ``master`` has updated the code in one of the following folders: ``src``, ``tests``, ``.github`` (to prevent running on added model cards, notebooks, etc.)
+     
+   * `self-hosted runner <https://github.com/huggingface/transformers/blob/master/.github/workflows/self-scheduled.yml>`__: runs slow tests on ``tests`` and ``examples``:
+
+   .. code-block:: bash
+
+    RUN_SLOW=1 USE_CUDA=1 pytest tests/
+    RUN_SLOW=1 USE_CUDA=1 pytest examples/
+
+   The results can be observed `here <https://github.com/huggingface/transformers/actions>`__.
+
+
+
+Running tests
+-------------
+
+
+
+
+
+Choosing which tests to run
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This document goes into many details of how tests can be run. If after reading everything, you need even more details you will find them `here <https://docs.pytest.org/en/latest/usage.html>`__.
+
+Here are some of the most useful ways of running tests.
+
+Run all:
+
+.. code-block:: console
+
+   pytest
+
+or:
+
+.. code-block:: bash
+
+   make test
+
+Note that the latter is defined as:
+
+.. code-block:: bash
+
+   python -m pytest -n auto --dist=loadfile -s -v ./tests/
+
+which tells pytest to:
+
+* run as many test processes as there are CPU cores (which could be too many if you don't have a ton of RAM!)
+* ensure that all tests from the same file will be run by the same test process
+* do not capture output
+* run in verbose mode
+
+
+
+Getting the list of all tests
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+All tests of the test suite:
+
+.. code-block:: bash
+
+   pytest --collect-only -q
+
+All tests of a given test file:
+
+.. code-block:: bash
+
+   pytest tests/test_optimization.py --collect-only -q
+
+
+   
+Run a specific test module
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To run an individual test module:
+
+.. code-block:: bash
+
+   pytest tests/test_logging.py
+   
+
+Run specific tests
+~~~~~~~~~~~~~~~~~~
+
+Since unittest is used inside most of the tests, to run specific subtests you need to know the name of the unittest class containing those tests. For example, it could be:
+
+.. code-block:: bash
+
+   pytest tests/test_optimization.py::OptimizationTest::test_adam_w
+
+Here:
+
+* ``tests/test_optimization.py`` - the file with tests
+* ``OptimizationTest`` - the name of the class
+* ``test_adam_w`` - the name of the specific test function
+
+If the file contains multiple classes, you can choose to run only tests of a given class. For example:
+
+.. code-block:: bash
+
+   pytest tests/test_optimization.py::OptimizationTest
+
+
+will run all the tests inside that class.
+
+As mentioned earlier you can see what tests are contained inside the ``OptimizationTest`` class by running:
+
+.. code-block:: bash
+
+   pytest tests/test_optimization.py::OptimizationTest --collect-only -q
+
+  
+You can run tests by keyword expressions.
+
+To run only tests whose name contains ``adam``:
+
+.. code-block:: bash
+
+   pytest -k adam tests/test_optimization.py
+
+To run all tests except those whose name contains ``adam``:
+
+.. code-block:: bash
+
+   pytest -k "not adam" tests/test_optimization.py
+
+And you can combine the two patterns in one:
+
+
+.. code-block:: bash
+
+   pytest -k "ada and not adam" tests/test_optimization.py
+
+
+
+Run only modified tests
+~~~~~~~~~~~~~~~~~~~~~~~
+
+You can run the tests related to the unstaged files or the current branch (according to Git) by using `pytest-picked <https://github.com/anapaulagomes/pytest-picked>`__. This is a great way of quickly testing your changes didn't break anything, since it won't run the tests related to files you didn't touch.
+
+.. code-block:: bash
+
+    pip install pytest-picked
+
+.. code-block:: bash
+
+    pytest --picked
+
+All tests will be run from files and folders which are modified, but not
+yet committed.
+
+Automatically rerun failed tests on source modification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+`pytest-xdist <https://github.com/pytest-dev/pytest-xdist>`__ provides a
+very useful feature of detecting all failed tests, and then waiting for
+you to modify files and continuously re-running those failing tests until
+they pass while you fix them, so that you don't need to restart pytest
+after you made the fix. This is repeated until all tests pass, after
+which a full run is performed again.
+
+.. code-block:: bash
+
+    pip install pytest-xdist
+
+To enter the mode: ``pytest -f`` or ``pytest --looponfail``
+
+File changes are detected by looking at ``looponfailroots`` root
+directories and all of their contents (recursively). If the default for
+this value does not work for you, you can change it in your project by
+setting a configuration option in ``setup.cfg``:
+
+.. code-block:: ini
+
+    [tool:pytest]
+    looponfailroots = transformers tests
+
+or ``pytest.ini``/``tox.ini`` files:
+
+.. code-block:: ini
+
+    [pytest]
+    looponfailroots = transformers tests
+
+This would lead to only looking for file changes in the respective
+directories, specified relatively to the ini-file’s directory.
+
+`pytest-watch <https://github.com/joeyespo/pytest-watch>`__ is an
+alternative implementation of this functionality.
+
+
+Skip a test module
+~~~~~~~~~~~~~~~~~~
+
+If you want to run all test modules, except a few you can exclude them by giving an explicit list of tests to run. For example, to run all except ``test_modeling_*.py`` tests:
+
+.. code-block:: bash
+
+   pytest `ls -1 tests/*py | grep -v test_modeling`
+
+
+Clearing state
+~~~~~~~~~~~~~~
+
+On CI builds, and whenever isolation is more important than speed, the cache
+should be cleared:
+
+.. code-block:: bash
+
+    pytest --cache-clear tests
+
+Running tests in parallel
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+As mentioned earlier ``make test`` runs tests in parallel via ``pytest-xdist`` plugin (``-n X`` argument, e.g. ``-n 2`` to run 2 parallel jobs).
+
+``pytest-xdist``'s ``--dist=`` option allows one to control how the tests are grouped. ``--dist=loadfile`` puts the tests located in one file onto the same process.
+
+Since the order of executed tests is different and unpredictable, if
+running the test suite with ``pytest-xdist`` produces failures (meaning
+we have some undetected coupled tests), use
+`pytest-replay <https://github.com/ESSS/pytest-replay>`__ to replay the
+tests in the same order, which should help with then somehow reducing
+that failing sequence to a minimum.
+
+Test order and repetition
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+It's good to repeat the tests several times, in sequence, randomly, or
+in sets, to detect any potential inter-dependency and state-related bugs
+(tear down). And the straightforward multiple repetition is just good to
+detect some problems that get uncovered by randomness of DL.
+
+
+Repeat tests
+^^^^^^^^^^^^
+
+* `pytest-flakefinder <https://github.com/dropbox/pytest-flakefinder>`__:
+
+.. code-block:: bash
+
+   pip install pytest-flakefinder
+
+And then run every test multiple times (50 by default):
+
+.. code-block:: bash
+
+   pytest --flake-finder --flake-runs=5 tests/test_failing_test.py
+   
+.. note::
+   This plugin doesn't work with ``-n`` flag from ``pytest-xdist``.
+   
+.. note::
+   There is another plugin ``pytest-repeat``, but it doesn't work with ``unittest``.
+
+
+Run tests in a random order
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+    pip install pytest-random-order
+
+Important: the presence of ``pytest-random-order`` will automatically
+randomize tests; no configuration changes or command line options are
+required.
+
+As explained earlier this allows detection of coupled tests - where one
+test's state affects the state of another. When ``pytest-random-order``
+is installed it will print the random seed it used for that session,
+e.g:
+
+.. code-block:: bash
+
+   pytest tests
+   [...]
+   Using --random-order-bucket=module
+   Using --random-order-seed=573663
+
+So that if the given particular sequence fails, you can reproduce it by
+adding that exact seed, e.g.:
+
+.. code-block:: bash
+
+   pytest --random-order-seed=573663
+   [...]
+   Using --random-order-bucket=module
+   Using --random-order-seed=573663
+
+It will only reproduce the exact order if you use the exact same list of
+tests (or no list at all). Once you start manually narrowing
+down the list you can no longer rely on the seed, but have to list them
+manually in the exact order they failed and tell pytest to not randomize
+them instead using ``--random-order-bucket=none``, e.g.:
+
+.. code-block:: bash
+
+   pytest --random-order-bucket=none tests/test_a.py tests/test_c.py tests/test_b.py
+
+To disable the shuffling for all tests:
+
+.. code-block:: bash
+
+    pytest --random-order-bucket=none
+
+By default ``--random-order-bucket=module`` is implied, which will
+shuffle the files on the module levels. It can also shuffle on
+``class``, ``package``, ``global`` and ``none`` levels. For the complete
+details please see its `documentation <https://github.com/jbasko/pytest-random-order>`__.
+
+Another randomization alternative is `pytest-randomly <https://github.com/pytest-dev/pytest-randomly>`__. This module has a very similar functionality/interface, but it doesn't have the bucket modes available in ``pytest-random-order``. It has the same problem of imposing itself once installed.
+
+Look and feel variations
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+pytest-sugar
+^^^^^^^^^^^^
+
+`pytest-sugar <https://github.com/Frozenball/pytest-sugar>`__ is a
+plugin that improves the look-n-feel, adds a progressbar, and shows tests
+that fail and the assert instantly. It gets activated automatically upon
+installation.
+
+.. code-block:: bash
+                
+   pip install pytest-sugar
+
+To run tests without it, run:
+
+.. code-block:: bash
+
+    pytest -p no:sugar
+
+or uninstall it.
+
+
+
+Report each sub-test name and its progress
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+For a single or a group of tests via ``pytest`` (after
+``pip install pytest-pspec``):
+
+.. code-block:: bash
+
+   pytest --pspec tests/test_optimization.py 
+
+
+
+Instantly shows failed tests
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+`pytest-instafail <https://github.com/pytest-dev/pytest-instafail>`__
+shows failures and errors instantly instead of waiting until the end of
+test session.
+
+.. code-block:: bash
+
+    pip install pytest-instafail
+
+.. code-block:: bash
+
+    pytest --instafail
+
+To GPU or not to GPU
+~~~~~~~~~~~~~~~~~~~~
+
+On a GPU-enabled setup, to test in CPU-only mode add ``CUDA_VISIBLE_DEVICES=""``:
+
+.. code-block:: bash
+                
+    CUDA_VISIBLE_DEVICES="" pytest tests/test_logging.py
+
+or if you have multiple gpus, you can tell which one to use in this test session, e.g. to use only the second gpu if you have gpus ``0`` and ``1``, you can run:
+
+.. code-block:: bash
+                
+    CUDA_VISIBLE_DEVICES="1" pytest tests/test_logging.py
+
+This is handy when you want to run different tasks on different GPUs.
+    
+And we have these decorators that require the condition described by the marker.
+
+.. code-block:: python
+
+    @require_torch
+    @require_tf
+    @require_multigpu
+    @require_non_multigpu
+    @require_torch_tpu
+    @require_torch_and_cuda
+
+This section will be expanded soon once our work in progress on those decorators is finished.
+
+Inside tests:
+
+* How many GPUs are available:
+
+.. code-block:: python
+
+   torch.cuda.device_count()
+
+
+   
+
+
+Output capture
+~~~~~~~~~~~~~~
+
+During test execution any output sent to ``stdout`` and ``stderr`` is
+captured. If a test or a setup method fails, its according captured
+output will usually be shown along with the failure traceback.
+
+To disable output capturing and to get the ``stdout`` and ``stderr``
+normally, use ``-s`` or ``--capture=no``:
+
+.. code-block:: bash
+
+   pytest -s tests/test_logging.py
+
+To send test results to JUnit format output:
+
+.. code-block:: bash
+
+   py.test tests --junitxml=result.xml
+
+
+Color control
+~~~~~~~~~~~~~
+
+To have no color (e.g., yellow on white background is not readable):
+
+.. code-block:: bash
+
+   pytest --color=no tests/test_logging.py
+
+
+
+Sending test report to online pastebin service
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Creating a URL for each test failure:
+
+.. code-block:: bash
+
+   pytest --pastebin=failed tests/test_logging.py
+
+This will submit test run information to a remote Paste service and
+provide a URL for each failure. You may select tests as usual or add for
+example -x if you only want to send one particular failure.
+
+Creating a URL for a whole test session log:
+
+.. code-block:: bash
+
+   pytest --pastebin=all tests/test_logging.py
+
+
+
+Writing tests
+-------------
+
+🤗 transformers tests are based on ``unittest``, but run by ``pytest``, so most of the time features from both systems can be used.
+
+You can read `here <https://docs.pytest.org/en/stable/unittest.html>`__ which features are supported, but the important thing to remember is that most ``pytest`` fixtures don't work. Neither does parametrization, but we use the module ``parameterized`` that works in a similar way.
+
+
+Parametrization
+~~~~~~~~~~~~~~~
+
+Often, there is a need to run the same test multiple times, but with different arguments. It could be done from within the test, but then there is no way of running that test for just one set of arguments.
+
+.. code-block:: python
+                
+    # test_this1.py
+    import unittest
+    from parameterized import parameterized
+    class TestMathUnitTest(unittest.TestCase):
+        @parameterized.expand([
+            ("negative", -1.5, -2.0),
+            ("integer", 1, 1.0),
+            ("large fraction", 1.6, 1),
+        ])
+        def test_floor(self, name, input, expected):
+            assert_equal(math.floor(input), expected)
+
+Now, by default this test will be run 3 times, each time with the last 3 arguments of ``test_floor`` being assigned the corresponding arguments in the parameter list.
+
+and you could run just the ``negative`` and ``integer`` sets of params with:
+
+.. code-block:: bash
+
+   pytest -k "negative or integer" tests/test_mytest.py
+
+or all but ``negative`` sub-tests, with:
+
+.. code-block:: bash
+
+   pytest -k "not negative" tests/test_mytest.py
+
+Besides using the ``-k`` filter that was just mentioned, you can find out the exact name of each sub-test and run any or all of them using their exact names. 
+        
+.. code-block:: bash
+                
+    pytest test_this1.py --collect-only -q
+
+and it will list:
+                
+.. code-block:: bash
+
+    test_this1.py::TestMathUnitTest::test_floor_0_negative
+    test_this1.py::TestMathUnitTest::test_floor_1_integer
+    test_this1.py::TestMathUnitTest::test_floor_2_large_fraction
+
+So now you can run just 2 specific sub-tests:
+
+.. code-block:: bash
+
+    pytest test_this1.py::TestMathUnitTest::test_floor_0_negative  test_this1.py::TestMathUnitTest::test_floor_1_integer
+   
+The module `parameterized <https://pypi.org/project/parameterized/>`__ which is already in the developer dependencies of ``transformers`` works for both: ``unittests`` and ``pytest`` tests.
+
+If, however, the test is not a ``unittest``, you may use ``pytest.mark.parametrize`` (or you may see it being used in some existing tests, mostly under ``examples``).
+
+Here is the same example, this time using ``pytest``'s ``parametrize`` marker:
+
+.. code-block:: python
+
+    # test_this2.py
+    import pytest
+    @pytest.mark.parametrize(
+        "name, input, expected",
+        [
+            ("negative", -1.5, -2.0),
+            ("integer", 1, 1.0),
+            ("large fraction", 1.6, 1),
+        ],
+    )
+    def test_floor(name, input, expected):
+        assert_equal(math.floor(input), expected)
+
+Same as with ``parameterized``, with ``pytest.mark.parametrize`` you can have a fine control over which sub-tests are run, if the ``-k`` filter doesn't do the job. Except, this parametrization function creates a slightly different set of names for the sub-tests. Here is what they look like:
+        
+.. code-block:: bash
+                
+    pytest test_this2.py --collect-only -q
+
+and it will list:
+                
+.. code-block:: bash
+
+    test_this2.py::test_floor[integer-1-1.0]
+    test_this2.py::test_floor[negative--1.5--2.0]
+    test_this2.py::test_floor[large fraction-1.6-1]       
+
+So now you can run just the specific test:
+
+.. code-block:: bash
+
+    pytest test_this2.py::test_floor[negative--1.5--2.0] test_this2.py::test_floor[integer-1-1.0]
+
+as in the previous example.
+
+    
+
+Temporary files and directories
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Using unique temporary files and directories is essential for parallel test running, so that the tests won't overwrite each other's data. Also we want to get the temp files and directories removed at the end of each test that created them. Therefore, using packages like ``tempfile``, which address these needs, is essential.
+
+However, when debugging tests, you need to be able to see what goes into the temp file or directory and you want to know its exact path and not have it randomized on every test re-run.
+
+A helper class :obj:`transformers.testing_utils.TestCasePlus` is best used for such purposes. It's a sub-class of :obj:`unittest.TestCase`, so we can easily inherit from it in the test modules.
+
+Here is an example of its usage:
+
+.. code-block:: python
+
+    from transformers.testing_utils import TestCasePlus
+    class ExamplesTests(TestCasePlus):
+        def test_whatever(self):
+            tmp_dir = self.get_auto_remove_tmp_dir()
+
+This code creates a unique temporary directory, and sets :obj:`tmp_dir` to its location.
+
+In this and all the following scenarios the temporary directory will be auto-removed at the end of test, unless ``after=False`` is passed to the helper function.
+
+* Create a temporary directory of my choice and delete it at the end - useful for debugging when you want to monitor a specific directory:
+
+.. code-block:: python
+
+    def test_whatever(self):
+        tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test")
+
+* Create a temporary directory of my choice and do not delete it at the end---useful for when you want to look at the temp results:
+
+.. code-block:: python
+
+    def test_whatever(self):
+        tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", after=False)
+
+* Create a temporary directory of my choice and ensure to delete it right away---useful for when you disabled deletion in the previous test run and want to make sure that the temporary directory is empty before the new test is run:
+
+.. code-block:: python
+
+    def test_whatever(self):
+        tmp_dir = self.get_auto_remove_tmp_dir(tmp_dir="./tmp/run/test", before=True)
+
+.. note::
+   In order to run the equivalent of ``rm -r`` safely, only subdirs of the project repository checkout are allowed if an explicit :obj:`tmp_dir` is used, so that by mistake no ``/tmp`` or similar important part of the filesystem will get nuked. I.e. please always pass paths that start with ``./``.
+
+.. note::
+   Each test can register multiple temporary directories and they all will get auto-removed, unless requested otherwise.
+
+
+Skipping tests
+~~~~~~~~~~~~~~
+
+This is useful when a bug is found and a new test is written, yet the
+bug is not fixed yet. In order to be able to commit it to the main
+repository we need to make sure it's skipped during ``make test``.
+
+Methods:
+
+-  A **skip** means that you expect your test to pass only if some
+   conditions are met, otherwise pytest should skip running the test
+   altogether. Common examples are skipping windows-only tests on
+   non-windows platforms, or skipping tests that depend on an external
+   resource which is not available at the moment (for example a
+   database).
+
+-  A **xfail** means that you expect a test to fail for some reason. A
+   common example is a test for a feature not yet implemented, or a bug
+   not yet fixed. When a test passes despite being expected to fail
+   (marked with pytest.mark.xfail), it’s an xpass and will be reported
+   in the test summary.
+
+One of the important differences between the two is that ``skip``
+doesn't run the test, and ``xfail`` does. So if the code that's buggy
+causes some bad state that will affect other tests, do not use
+``xfail``.
+
+Implementation
+^^^^^^^^^^^^^^
+
+- Here is how to skip a whole test unconditionally:
+
+.. code-block:: python
+
+    @unittest.skip("this bug needs to be fixed")
+    def test_feature_x():
+
+or via pytest:
+
+.. code-block:: python
+
+    @pytest.mark.skip(reason="this bug needs to be fixed")
+
+or the ``xfail`` way:
+
+.. code-block:: python
+
+    @pytest.mark.xfail
+    def test_feature_x():
+
+Here is how to skip a test based on some internal check inside the test:
+
+.. code-block:: python
+
+    def test_feature_x():
+        if not has_something():
+            pytest.skip("unsupported configuration")
+
+or the whole module:
+
+.. code-block:: python
+
+    import pytest
+    if not pytest.config.getoption("--custom-flag"):
+        pytest.skip("--custom-flag is missing, skipping tests", allow_module_level=True)
+
+or the ``xfail`` way:
+
+.. code-block:: python
+
+    def test_feature_x():
+        pytest.xfail("expected to fail until bug XYZ is fixed")
+
+Here is how to skip all tests in a module if some import is missing:
+
+.. code-block:: python
+
+    docutils = pytest.importorskip("docutils", minversion="0.3")
+
+-  Skip a test based on a condition:
+
+.. code-block:: python
+
+    @pytest.mark.skipif(sys.version_info < (3,6), reason="requires python3.6 or higher")
+    def test_feature_x():
+
+or:
+
+.. code-block:: python
+
+    @unittest.skipIf(torch_device == "cpu", "Can't do half precision")
+    def test_feature_x():
+   
+or skip the whole class:
+
+.. code-block:: python
+
+    @pytest.mark.skipif(sys.platform == 'win32', reason="does not run on windows")
+    class TestClass():
+        def test_feature_x(self):
+
+More details, example and ways are `here <https://docs.pytest.org/en/latest/skipping.html>`__.
+
+Custom markers
+~~~~~~~~~~~~~~
+
+* Slow tests
+
+Tests that are too slow (e.g. ones downloading huge model files) are marked with:
+
+.. code-block:: python
+
+    from transformers.testing_utils import slow
+    @slow
+    def test_integration_foo():
+
+To run such tests set ``RUN_SLOW=1`` env var, e.g.:
+
+.. code-block:: bash
+
+    RUN_SLOW=1 pytest tests
+    
+It's important that the decorator ``@slow`` appears last in the stack of decorators, as some decorators like ``parameterized`` may interfere with its normal functioning. Here is an example of the correct usage:
+
+.. code-block:: python
+
+    @parameterized.expand(...)
+    @slow
+    def test_integration_foo():
+
+Testing the stdout/stderr output
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In order to test functions that write to ``stdout`` and/or ``stderr``,
+the test can access those streams using the ``pytest``'s `capsys
+system <https://docs.pytest.org/en/latest/capture.html>`__. Here is how
+this is accomplished:
+
+.. code-block:: python
+
+    import sys
+    def print_to_stdout(s): print(s)
+    def print_to_stderr(s): sys.stderr.write(s)
+    def test_result_and_stdout(capsys):
+        msg = "Hello"
+        print_to_stdout(msg)
+        print_to_stderr(msg)
+        out, err = capsys.readouterr() # consume the captured output streams
+        # optional: if you want to replay the consumed streams:
+        sys.stdout.write(out)
+        sys.stderr.write(err)
+        # test:
+        assert msg in out
+        assert msg in err
+
+And, of course, most of the time, ``stderr`` will come as a part of an
+exception, so try/except has to be used in such a case:
+
+.. code-block:: python
+
+    def raise_exception(msg): raise ValueError(msg)
+    def test_something_exception():
+        msg = "Not a good value"
+        error = ''
+        try:
+            raise_exception(msg)
+        except Exception as e:
+            error = str(e)
+            assert msg in error, f"{msg} is in the exception:\n{error}"
+
+Another approach to capturing stdout is via ``contextlib.redirect_stdout``:
+
+.. code-block:: python
+
+    from io import StringIO
+    from contextlib import redirect_stdout
+    def print_to_stdout(s): print(s)
+    def test_result_and_stdout():
+        msg = "Hello"
+        buffer = StringIO()
+        with redirect_stdout(buffer):
+            print_to_stdout(msg)
+        out = buffer.getvalue()
+        # optional: if you want to replay the consumed streams:
+        sys.stdout.write(out)
+        # test:
+        assert msg in out
+
+An important potential issue with capturing stdout is that it may
+contain ``\r`` characters that in normal ``print`` reset everything that
+has been printed so far. There is no problem with ``pytest``, but with
+``pytest -s`` these characters get included in the buffer, so to be able
+to have the test run with and without ``-s``, you have to make an extra
+cleanup to the captured output, using ``re.sub(r'^.*\r', '', buf, 0, re.M)``.
+
+But, then we have a helper context manager wrapper to automatically take
+care of it all, regardless of whether it has some ``\r``'s in it or
+not, so it's a simple:
+
+.. code-block:: python
+
+    from transformers.testing_utils import CaptureStdout
+    with CaptureStdout() as cs:
+        function_that_writes_to_stdout()
+    print(cs.out)
+
+Here is a full test example:
+
+.. code-block:: python
+
+    from transformers.testing_utils import CaptureStdout
+    msg = "Secret message\r"
+    final = "Hello World"
+    with CaptureStdout() as cs:
+        print(msg + final)
+    assert cs.out == final+"\n", f"captured: {cs.out}, expecting {final}"
+
+If you'd like to capture ``stderr`` use the :obj:`CaptureStderr` class
+instead:
+
+.. code-block:: python
+
+    from transformers.testing_utils import CaptureStderr
+    with CaptureStderr() as cs:
+        function_that_writes_to_stderr()
+    print(cs.err)
+
+If you need to capture both streams at once, use the parent
+:obj:`CaptureStd` class:
+
+.. code-block:: python
+
+    from transformers.testing_utils import CaptureStd
+    with CaptureStd() as cs:
+        function_that_writes_to_stdout_and_stderr()
+    print(cs.err, cs.out)
+
+
+
+Capturing logger stream
+~~~~~~~~~~~~~~~~~~~~~~~
+
+If you need to validate the output of a logger, you can use :obj:`CaptureLogger`:
+
+.. code-block:: python
+
+    from transformers import logging
+    from transformers.testing_utils import CaptureLogger
+
+    msg = "Testing 1, 2, 3"
+    logging.set_verbosity_info()
+    logger = logging.get_logger("transformers.tokenization_bart")
+    with CaptureLogger(logger) as cl:
+        logger.info(msg)
+    assert cl.out == msg+"\n"
+
+
+Testing with environment variables
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+If you want to test the impact of environment variables for a specific test you can use a helper decorator ``transformers.testing_utils.mockenv``:
+
+.. code-block:: python
+
+    from transformers.testing_utils import mockenv
+    class HfArgumentParserTest(unittest.TestCase):
+        @mockenv(TRANSFORMERS_VERBOSITY="error")
+        def test_env_override(self):
+            env_level_str = os.getenv("TRANSFORMERS_VERBOSITY", None)
+
+
+Getting reproducible results
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+In some situations you may want to remove randomness for your tests. To
+get identical, reproducible results, you will need to fix the seed:
+
+.. code-block:: python
+
+    seed = 42
+
+    # python RNG
+    import random
+    random.seed(seed)
+
+    # pytorch RNGs
+    import torch
+    torch.manual_seed(seed)
+    torch.backends.cudnn.deterministic = True
+    if torch.cuda.is_available(): torch.cuda.manual_seed_all(seed)
+
+    # numpy RNG
+    import numpy as np
+    np.random.seed(seed)
+
+    # tf RNG
+    tf.random.set_seed(seed)
+
+Debugging tests
+~~~~~~~~~~~~~~~
+
+To start a debugger at the point of the warning, do this:
+
+.. code-block:: bash
+
+    pytest tests/test_logging.py -W error::UserWarning --pdb
--- a/docs/source/tokenizer_summary.rst
+++ b/docs/source/tokenizer_summary.rst
@@ -0,0 +1,243 @@
+Tokenizer summary
+-----------------
+
+In this page, we will have a closer look at tokenization. As we saw in
+:doc:`the preprocessing tutorial <preprocessing>`, tokenizing a text is splitting it into words or subwords, which then
+are converted to ids. The second part is pretty straightforward, here we will focus on the first part. More
+specifically, we will look at the three main different kinds of tokenizers used in 🤗 Transformers:
+:ref:`Byte-Pair Encoding (BPE) <byte-pair-encoding>`, :ref:`WordPiece <wordpiece>` and
+:ref:`SentencePiece <sentencepiece>`, and provide examples of models using each of those.
+
+Note that on each model page, you can look at the documentation of the associated tokenizer to know which of those
+algorithms the pretrained model used. For instance, if we look at :class:`~transformers.BertTokenizer`, we can see it's
+using :ref:`WordPiece <wordpiece>`.
+
+Introduction to tokenization
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Splitting a text in smaller chunks is a task that's harder than it looks, and there are multiple ways of doing it. For
+instance, let's look at the sentence "Don't you love 🤗 Transformers? We sure do." A first simple way of tokenizing
+this text is just to split it by spaces, which would give:
+
+::
+
+    ["Don't", "you", "love", "🤗", "Transformers?", "We", "sure", "do."]
+
+This is a nice first step, but if we look at the tokens "Transformers?" or "do.", we can see we can do better. Those
+will be different than the tokens "Transformers" and "do" for our model, so we should probably take the punctuation
+into account. This would give:
+
+::
+
+    ["Don", "'", "t", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."]
+
+which is better already. One thing that is annoying though is how it dealt with "Don't". "Don't" stands for do not, so
+it should probably be better tokenized as ``["Do", "n't"]``. This is where things start getting more complicated, and
+part of the reason each kind of model has its own tokenizer class. Depending on the rules we apply to split our texts
+into tokens, we'll get different tokenized versions of the same text. And of course, a given pretrained model won't
+perform properly if you don't use the exact same rules as the persons who pretrained it.
+
+`spaCy <https://spacy.io/>`__ and `Moses <http://www.statmt.org/moses/?n=Development.GetStarted>`__ are two popular
+rule-based tokenizers. On the text above, they'd output something like:
+
+::
+
+    ["Do", "n't", "you", "love", "🤗", "Transformers", "?", "We", "sure", "do", "."]
+
+Space/punctuation-tokenization and rule-based tokenization are both examples of word tokenization, which is splitting a
+sentence into words. While it's the most intuitive way to separate texts in smaller chunks, it can have a problem when
+you have a huge corpus: it usually yields a very big vocabulary (the set of all unique tokens used).
+:doc:`Transformer XL <model_doc/transformerxl>` for instance uses space/punctuation-tokenization, and has a vocabulary
+size of 267,735!
+
+A huge vocabulary size means a huge embedding matrix at the start of the model, which will cause memory problems.
+TransformerXL deals with it by using a special kind of embeddings called adaptive embeddings, but in general,
+transformers models rarely have a vocabulary size greater than 50,000, especially if they are trained on a single
+language.
+
+So if tokenizing on words is unsatisfactory, we could go in the opposite direction and simply tokenize on characters.
+While it's very simple and would save a lot of memory, this doesn't allow the model to learn representations of texts
+as meaningful as when using a word tokenization, leading to a loss of performance. So to get the best of both worlds,
+all transformers models use a hybrid between word-level and character-level tokenization called subword tokenization.
+
+Subword tokenization
+^^^^^^^^^^^^^^^^^^^^
+
+Subword tokenization algorithms rely on the principle that most common words should be left as is, but rare words
+should be decomposed in meaningful subword units. For instance "annoyingly" might be considered a rare word and
+decomposed as "annoying" and "ly". This is especially useful in agglutinative languages such as Turkish, where you can
+form (almost) arbitrarily long complex words by stringing together some subwords.
+
+This allows the model to keep a reasonable vocabulary while still learning useful representations for common words or
+subwords. This also enables the model to process words it has never seen before, by decomposing them into
+subwords it knows. For instance, the base :class:`~transformers.BertTokenizer` will tokenize "I have a new GPU!" like
+this:
+
+.. code-block::
+
+    >>> from transformers import BertTokenizer
+    >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    >>> tokenizer.tokenize("I have a new GPU!")
+    ['i', 'have', 'a', 'new', 'gp', '##u', '!']
+
+Since we are considering the uncased model, the sentence was lowercased first. Then all the words were present in the
+vocabulary of the tokenizer, except for "gpu", so the tokenizer split it in subwords it knows: "gp" and "##u". The "##"
+means that the rest of the token should be attached to the previous one, without space (for when we need to decode
+predictions and reverse the tokenization).
+
+Another example is when we use the base :class:`~transformers.XLNetTokenizer` to tokenize our previous text:
+
+.. code-block::
+
+    >>> from transformers import XLNetTokenizer
+    >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased')
+    >>> tokenizer.tokenize("Don't you love 🤗 Transformers? We sure do.")
+    ['▁Don', "'", 't', '▁you', '▁love', '▁', '🤗', '▁', 'Transform', 'ers', '?', '▁We', '▁sure', '▁do', '.']
+
+We'll get back to the meaning of those '▁' when we look at :ref:`SentencePiece <sentencepiece>` but you can see
+Transformers has been split into "Transform" and "ers".
+
+Let's now look at how the different subword tokenization algorithms work. Note that they all rely on some form of
+training which is usually done on the corpus the corresponding model will be trained on.
+
+.. _byte-pair-encoding:
+
+Byte-Pair Encoding
+~~~~~~~~~~~~~~~~~~
+
+Byte-Pair Encoding was introduced in `this paper <https://arxiv.org/abs/1508.07909>`__. It relies on a pretokenizer
+splitting the training data into words, which can be a simple space tokenization
+(:doc:`GPT-2 <model_doc/gpt2>` and :doc:`Roberta <model_doc/roberta>` use this for instance) or a rule-based tokenizer
+(:doc:`XLM <model_doc/xlm>` uses Moses for most languages, as does :doc:`FlauBERT <model_doc/flaubert>`, while
+:doc:`GPT <model_doc/gpt>` uses Spacy and ftfy). The pretokenizer also counts the frequency of each word in the
+training corpus.
+
+It then begins from the list of all characters, and will learn merge rules to form a new token from two symbols in the
+vocabulary until it has learned a vocabulary of the desired size (this is a hyperparameter to pick).
+
+Let's say that after the pre-tokenization we have the following words (the number indicating the frequency of each
+word):
+
+::
+
+    ('hug', 10), ('pug', 5), ('pun', 12), ('bun', 4), ('hugs', 5)
+
+Then the base vocabulary is ['b', 'g', 'h', 'n', 'p', 's', 'u'] and all our words are first split by character:
+
+::
+
+    ('h' 'u' 'g', 10), ('p' 'u' 'g', 5), ('p' 'u' 'n', 12), ('b' 'u' 'n', 4), ('h' 'u' 'g' 's', 5)
+
+We then take each pair of symbols and look at the most frequent. For instance 'hu' is present `10 + 5 = 15` times (10
+times in the 10 occurrences of 'hug', 5 times in the 5 occurrences of 'hugs'). The most frequent here is 'ug', present
+`10 + 5 + 5 = 20` times in total. So the first merge rule the tokenizer learns is to group all 'u' and 'g' together
+then it adds 'ug' to the vocabulary. Our corpus then becomes
+
+::
+
+    ('h' 'ug', 10), ('p' 'ug', 5), ('p' 'u' 'n', 12), ('b' 'u' 'n', 4), ('h' 'ug' 's', 5)
+
+and we continue by looking at the next most common pair of symbols. It's 'un', present 16 times, so we merge those two
+and add 'un' to the vocabulary. Then it's 'hug' (as 'h' + 'ug'), present 15 times, so we merge those two and add 'hug'
+to the vocabulary.
+
+At this stage, the vocabulary is ``['b', 'g', 'h', 'n', 'p', 's', 'u', 'ug', 'un', 'hug']`` and our corpus is
+represented as
+
+::
+
+    ('hug', 10), ('p' 'ug', 5), ('p' 'un', 12), ('b' 'un', 4), ('hug' 's', 5)
+
+If we stop there, the tokenizer can apply the rules it learned to new words (as long as they don't contain characters that
+were not in the base vocabulary). For instance 'bug' would be tokenized as ``['b', 'ug']`` but 'mug' would be tokenized as
+``['<unk>', 'ug']`` since the 'm' is not in the base vocabulary. This doesn't happen to letters in general (since the
+base corpus uses all of them), but to special characters like emojis.
+
+As we said before, the vocabulary size (which is the base vocabulary size + the number of merges) is a hyperparameter
+to choose. For instance :doc:`GPT <model_doc/gpt>` has a vocabulary size of 40,478 since they have 478 base characters
+and chose to stop the training of the tokenizer at 40,000 merges.
+
+Byte-level BPE
+^^^^^^^^^^^^^^
+
+To deal with the fact the base vocabulary needs to get all base characters, which can be quite big if one allows for
+all unicode characters, the
+`GPT-2 paper <https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`__
+introduces a clever trick, which is to use bytes as the base vocabulary (which gives a size of 256). With some
+additional rules to deal with punctuation, this manages to be able to tokenize every text without needing an unknown
+token. For instance, the :doc:`GPT-2 model <model_doc/gpt2>` has a vocabulary size of 50,257, which corresponds to the
+256 bytes base tokens, a special end-of-text token and the symbols learned with 50,000 merges.
+
+.. _wordpiece:
+
+WordPiece
+~~~~~~~~~
+
+WordPiece is the subword tokenization algorithm used for :doc:`BERT <model_doc/bert>` (as well as
+:doc:`DistilBERT <model_doc/distilbert>` and :doc:`Electra <model_doc/electra>`) and was outlined in
+`this paper <https://static.googleusercontent.com/media/research.google.com/ja//pubs/archive/37842.pdf>`__. It relies
+on the same base as BPE, which is to initialize the vocabulary to every character present in the corpus and
+progressively learn a given number of merge rules, the difference is that it doesn't choose the pair that is the most
+frequent but the one that will maximize the likelihood on the corpus once merged.
+
+What does this mean? Well, in the previous example, it means we would only merge 'u' and 'g' if the probability of
+having 'ug' divided by the probability of having 'u' then 'g' is greater than for any other pair of symbols. It's
+subtly different from what BPE does in the sense that it evaluates what it "loses" by merging two symbols and makes
+sure it's `worth it`.
+
+.. _unigram:
+
+Unigram
+~~~~~~~
+
+Unigram is a subword tokenization algorithm introduced in `this paper <https://arxiv.org/pdf/1804.10959.pdf>`__.
+Instead of starting with a group of base symbols and learning merges with some rule, like BPE or WordPiece, it starts
+from a large vocabulary (for instance, all pretokenized words and the most common substrings) that it will trim down
+progressively. It's not used directly for any of the pretrained models in the library, but it's used in conjunction
+with :ref:`SentencePiece <sentencepiece>`.
+
+More specifically, at a given step, unigram computes a loss from the corpus we have and the current vocabulary, then,
+for each subword, evaluates how much the loss would increase if the subword was removed from the vocabulary. It then
+sorts the subwords by this quantity (that represents how much worse the loss becomes if the token is removed) and
+removes the p percent of tokens whose removal hurts the loss the least (for instance p could be 10% or 20%). It then
+repeats the process until the vocabulary has reached the desired size, always keeping the base characters (to be able
+to tokenize any word written with them, like BPE or WordPiece).
+
+Contrary to BPE and WordPiece that work out rules in a certain order that you can then apply in the same order when
+tokenizing new text, Unigram will have several ways of tokenizing a new text. For instance, if it ends up with the
+vocabulary
+
+::
+
+    ['b', 'g', 'h', 'n', 'p', 's', 'u', 'ug', 'un', 'hug']
+
+we had before, it could tokenize "hugs" as ``['hug', 's']``, ``['h', 'ug', 's']`` or ``['h', 'u', 'g', 's']``. So which
+one to choose? On top of saving the vocabulary, the trained tokenizer will save the probability of each token in the
+training corpus. You can then give a probability to each tokenization (which is the product of the probabilities of the
+tokens forming it) and pick the most likely one (or if you want to apply some data augmentation, you could sample one
+of the tokenizations according to their probabilities).
+
+Those probabilities define the loss that trains the tokenizer: if our corpus consists of the
+words :math:`x_{1}, \dots, x_{N}` and if for the word :math:`x_{i}` we note :math:`S(x_{i})` the set of all possible
+tokenizations of :math:`x_{i}` (with the current vocabulary), then the loss is defined as
+
+.. math::
+    \mathcal{L} = -\sum_{i=1}^{N} \log \left ( \sum_{x \in S(x_{i})} p(x) \right )
+
+.. _sentencepiece:
+
+SentencePiece
+~~~~~~~~~~~~~
+
+All the methods we have been looking at so far required some form of pretokenization, which has a central problem: not
+all languages use spaces to separate words. This is a problem :doc:`XLM <model_doc/xlm>` solves by using specific
+pretokenizers for each of those languages (in this case, Chinese, Japanese and Thai). To solve this problem,
+SentencePiece (introduced in `this paper <https://arxiv.org/pdf/1808.06226.pdf>`__) treats the input as a raw stream,
+includes the space in the set of characters to use, then uses BPE or unigram to construct the appropriate vocabulary.
+
+That's why in the example we saw before using :class:`~transformers.XLNetTokenizer` (which uses SentencePiece), we had
+the '▁' character, that represents space. Decoding a tokenized text is then super easy: we just have to concatenate
+all of them together and replace '▁' with space.
+
+All transformers models in the library that use SentencePiece use it with unigram. Examples of models using it are
+:doc:`ALBERT <model_doc/albert>`, :doc:`XLNet <model_doc/xlnet>` or the :doc:`Marian framework <model_doc/marian>`.
--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -1,135 +0,0 @@
-TorchScript
-================================================
-
-.. note::
-    This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities
-    with variable-input-size models. It is a focus of interest to us and we will deepen our analysis in upcoming
-    releases, with more code examples, a more flexible implementation, and benchmarks comparing python-based codes
-    with compiled TorchScript.
-
-
-According to Pytorch's documentation: "TorchScript is a way to create serializable and optimizable models from PyTorch code".
-Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
-their model to be re-used in other programs, such as efficiency-oriented C++ programs.
-
-We have provided an interface that allows the export of `transformers` models to TorchScript so that they can
-be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
-they can be exported, and what to be mindful of when using these models with TorchScript.
-
-Exporting a model needs two things:
-
-* dummy inputs to execute a model forward pass.
-* the model needs to be instantiated with the ``torchscript`` flag.
-
-These necessities imply several things developers should be careful about. These are detailed below.
-
-
-Implications
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-TorchScript flag and tied weights
------------------------------------------------
-This flag is necessary because most of the language models in this repository have tied weights between their
-``Embedding`` layer and their ``Decoding`` layer. TorchScript does not allow the export of models that have tied weights,
-it is therefore necessary to untie the weights beforehand.
-
-This implies that models instantiated with the ``torchscript`` flag have their ``Embedding`` layer and ``Decoding`` layer
-separate, which means that they should not be trained down the line. Training would de-synchronize the two layers,
-leading to unexpected results.
-
-This is not the case for models that do not have a Language Model head, as those do not have tied weights. These models
-can be safely exported without the ``torchscript`` flag.
-
-Dummy inputs and standard lengths
------------------------------------------------
-
-The dummy inputs are used to do a model forward pass. While the inputs' values are propagating through the layers,
-Pytorch keeps track of the different operations executed on each tensor. These recorded operations are then used
-to create the "trace" of the model.
-
-The trace is created relatively to the inputs' dimensions. It is therefore constrained by the dimensions of the dummy
-input, and will not work for any other sequence length or batch size. When trying with a different size, an error such
-as:
-
-``The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2``
-
-will be raised. It is therefore recommended to trace the model with a dummy input size at least as large as the largest
-input that will be fed to the model during inference. Padding can be performed to fill the missing values. As the model
-will have been traced with a large input size however, the dimensions of the different matrix will be large as well,
-resulting in more calculations.
-
-It is recommended to be careful of the total number of operations done on each input and to follow performance closely
-when exporting varying sequence-length models.
-
-Using TorchScript in Python
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-Below are examples of using the Python to save, load models as well as how to use the trace for inference.
-
-Saving a model
------------------------------------------------
-
-This snippet shows how to use TorchScript to export a ``BertModel``. Here the ``BertModel`` is instantiated
-according to a ``BertConfig`` class and then saved to disk under the filename ``traced_bert.pt``
-
-.. code-block:: python
-
-    from transformers import BertModel, BertTokenizer, BertConfig
-    import torch
-
-    enc = BertTokenizer.from_pretrained("bert-base-uncased")
-
-    # Tokenizing input text
-    text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-    tokenized_text = enc.tokenize(text)
-
-    # Masking one of the input tokens
-    masked_index = 8
-    tokenized_text[masked_index] = '[MASK]'
-    indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
-    segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
-
-    # Creating a dummy input
-    tokens_tensor = torch.tensor([indexed_tokens])
-    segments_tensors = torch.tensor([segments_ids])
-    dummy_input = [tokens_tensor, segments_tensors]
-
-    # Initializing the model with the torchscript flag
-    # Flag set to True even though it is not necessary as this model does not have an LM Head.
-    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
-        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, torchscript=True)
-
-    # Instantiating the model
-    model = BertModel(config)
-
-    # The model needs to be in evaluation mode
-    model.eval()
-
-    # If you are instantiating the model with `from_pretrained` you can also easily set the TorchScript flag
-    model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
-
-    # Creating the trace
-    traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
-    torch.jit.save(traced_model, "traced_bert.pt")
-
-Loading a model
------------------------------------------------
-
-This snippet shows how to load the ``BertModel`` that was previously saved to disk under the name ``traced_bert.pt``.
-We are re-using the previously initialised ``dummy_input``.
-
-.. code-block:: python
-
-    loaded_model = torch.jit.load("traced_model.pt")
-    loaded_model.eval()
-
-    all_encoder_layers, pooled_output = loaded_model(dummy_input)
-
-Using a traced model for inference
------------------------------------------------
-
-Using the traced model for inference is as simple as using its ``__call__`` dunder method:
-
-.. code-block:: python
-
-    traced_model(tokens_tensor, segments_tensors)
--- a/docs/source/training.rst
+++ b/docs/source/training.rst
@@ -0,0 +1,318 @@
+Training and fine-tuning
+========================
+
+Model classes in 🤗 Transformers are designed to be compatible with native
+PyTorch and TensorFlow 2 and can be used seamlessly with either. In this
+quickstart, we will show how to fine-tune (or train from scratch) a model
+using the standard training tools available in either framework. We will also
+show how to use our included :func:`~transformers.Trainer` class which
+handles much of the complexity of training for you.
+
+This guide assumes that you are already familiar with loading and using our
+models for inference; otherwise, see the :doc:`task summary <task_summary>`. We also assume
+that you are familiar with training deep neural networks in either PyTorch or
+TF2, and focus specifically on the nuances and tools for training models in
+🤗 Transformers.
+
+Sections:
+
+  - :ref:`pytorch`
+  - :ref:`tensorflow`
+  - :ref:`trainer`
+  - :ref:`additional-resources`
+
+.. _pytorch:
+
+Fine-tuning in native PyTorch
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Model classes in 🤗 Transformers that don't begin with ``TF`` are
+`PyTorch Modules <https://pytorch.org/docs/master/generated/torch.nn.Module.html>`_,
+meaning that you can use them just as you would any model in PyTorch for
+both inference and optimization.
+
+Let's consider the common task of fine-tuning a masked language model like
+BERT on a sequence classification dataset. When we instantiate a model with
+:func:`~transformers.PreTrainedModel.from_pretrained`, the model
+configuration and pre-trained weights
+of the specified model are used to initialize the model. The
+library also includes a number of task-specific final layers or 'heads' whose
+weights are instantiated randomly when not present in the specified
+pre-trained model. For example, instantiating a model with
+``BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)``
+will create a BERT model instance with encoder weights copied from the
+``bert-base-uncased`` model and a randomly initialized sequence
+classification head on top of the encoder with an output size of 2. Models
+are initialized in ``eval`` mode by default. We can call ``model.train()`` to
+put it in train mode.
+
+.. code-block:: python
+
+    from transformers import BertForSequenceClassification
+    model = BertForSequenceClassification.from_pretrained('bert-base-uncased', return_dict=True)
+    model.train()
+
+This is useful because it allows us to make use of the pre-trained BERT
+encoder and easily train it on whatever sequence classification dataset we
+choose. We can use any PyTorch optimizer, but our library also provides the
+:func:`~transformers.AdamW` optimizer which implements gradient bias
+correction as well as weight decay.
+
+.. code-block:: python
+
+    from transformers import AdamW
+    optimizer = AdamW(model.parameters(), lr=1e-5)
+
+The optimizer allows us to apply different hyperparameters for specific
+parameter groups. For example, we can apply weight decay to all parameters
+other than bias and layer normalization terms:
+
+.. code-block:: python
+
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=1e-5)
+    
+Now we can set up a simple dummy training batch using
+:func:`~transformers.PreTrainedTokenizer.__call__`. This returns a
+:func:`~transformers.BatchEncoding` instance which
+prepares everything we might need to pass to the model.
+
+.. code-block:: python
+
+    from transformers import BertTokenizer
+    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    text_batch = ["I love Pixar.", "I don't care for Pixar."]
+    encoding = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True)
+    input_ids = encoding['input_ids']
+    attention_mask = encoding['attention_mask']
+
+When we call a classification model with the ``labels`` argument, the first
+returned element is the Cross Entropy loss between the predictions and the
+passed labels. Having already set up our optimizer, we can then do a
+backwards pass and update the weights:
+
+.. code-block:: python
+
+    labels = torch.tensor([1,0]).unsqueeze(0)
+    outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
+    loss = outputs.loss
+    loss.backward()
+    optimizer.step()
+
+Alternatively, you can just get the logits and calculate the loss yourself.
+The following is equivalent to the previous example:
+
+.. code-block:: python
+
+    from torch.nn import functional as F
+    labels = torch.tensor([1,0])
+    outputs = model(input_ids, attention_mask=attention_mask)
+    loss = F.cross_entropy(outputs.logits, labels)
+    loss.backward()
+    optimizer.step()
+
+Of course, you can train on GPU by calling ``to('cuda')`` on the model and
+inputs as usual.
+
+We also provide a few learning rate scheduling tools. With the following, we
+can set up a scheduler which warms up for ``num_warmup_steps`` and then
+linearly decays to 0 by the end of training.
+
+.. code-block:: python
+
+    from transformers import get_linear_schedule_with_warmup
+    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps, num_train_steps)
+
+Then all we have to do is call ``scheduler.step()`` after ``optimizer.step()``.
+
+.. code-block:: python
+
+    loss.backward()
+    optimizer.step()
+    scheduler.step()
+
+We highly recommend using :func:`~transformers.Trainer`, discussed below,
+which conveniently handles the moving parts of training 🤗 Transformers models
+with features like mixed precision and easy tensorboard logging.
+
+
+Freezing the encoder
+--------------------
+
+In some cases, you might be interested in keeping the weights of the
+pre-trained encoder frozen and optimizing only the weights of the head
+layers. To do so, simply set the ``requires_grad`` attribute to ``False`` on
+the encoder parameters, which can be accessed with the ``base_model``
+submodule on any task-specific model in the library:
+
+.. code-block:: python
+
+    for param in model.base_model.parameters():
+        param.requires_grad = False
+
+
+.. _tensorflow:
+
+Fine-tuning in native TensorFlow 2
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Models can also be trained natively in TensorFlow 2. Just as with PyTorch,
+TensorFlow models can be instantiated with
+:func:`~transformers.PreTrainedModel.from_pretrained` to load the weights of
+the encoder from a pretrained model.
+
+.. code-block:: python
+
+    from transformers import TFBertForSequenceClassification
+    model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+Let's use ``tensorflow_datasets`` to load in the `MRPC dataset
+<https://www.tensorflow.org/datasets/catalog/glue#gluemrpc>`_ from GLUE. We
+can then use our built-in
+:func:`~transformers.data.processors.glue.glue_convert_examples_to_features`
+to tokenize MRPC and convert it to a TensorFlow ``Dataset`` object. Note that
+tokenizers are framework-agnostic, so there is no need to prepend ``TF`` to
+the pretrained tokenizer name.
+
+.. code-block:: python
+
+    from transformers import BertTokenizer, glue_convert_examples_to_features
+    import tensorflow as tf
+    import tensorflow_datasets as tfds
+    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+    data = tfds.load('glue/mrpc')
+    train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, max_length=128, task='mrpc')
+    train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
+
+The model can then be compiled and trained as any Keras model:
+
+.. code-block:: python
+
+    optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
+    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+    model.compile(optimizer=optimizer, loss=loss)
+    model.fit(train_dataset, epochs=2, steps_per_epoch=115)
+
+With the tight interoperability between TensorFlow and PyTorch models, you
+can even save the model and then reload it as a PyTorch model (or vice-versa):
+
+.. code-block:: python
+
+    from transformers import BertForSequenceClassification
+    model.save_pretrained('./my_mrpc_model/')
+    pytorch_model = BertForSequenceClassification.from_pretrained('./my_mrpc_model/', from_tf=True)
+
+
+.. _trainer:
+
+Trainer
+^^^^^^^
+
+We also provide a simple but feature-complete training and evaluation
+interface through :class:`~transformers.Trainer` and
+:class:`~transformers.TFTrainer`. You can train, fine-tune,
+and evaluate any 🤗 Transformers model with a wide range of training options and
+with built-in features like logging, gradient accumulation, and mixed
+precision.
+
+.. code-block:: python
+
+    ## PYTORCH CODE
+    from transformers import BertForSequenceClassification, Trainer, TrainingArguments
+
+    model = BertForSequenceClassification.from_pretrained("bert-large-uncased")
+
+    training_args = TrainingArguments(
+        output_dir='./results',          # output directory
+        num_train_epochs=3,              # total # of training epochs
+        per_device_train_batch_size=16,  # batch size per device during training
+        per_device_eval_batch_size=64,   # batch size for evaluation
+        warmup_steps=500,                # number of warmup steps for learning rate scheduler
+        weight_decay=0.01,               # strength of weight decay
+        logging_dir='./logs',            # directory for storing logs
+    )
+
+    trainer = Trainer(
+        model=model,                         # the instantiated 🤗 Transformers model to be trained
+        args=training_args,                  # training arguments, defined above
+        train_dataset=train_dataset,         # training dataset
+        eval_dataset=test_dataset            # evaluation dataset
+    )
+    ## TENSORFLOW CODE
+    from transformers import TFBertForSequenceClassification, TFTrainer, TFTrainingArguments
+
+    model = TFBertForSequenceClassification.from_pretrained("bert-large-uncased")
+
+    training_args = TFTrainingArguments(
+        output_dir='./results',          # output directory
+        num_train_epochs=3,              # total # of training epochs
+        per_device_train_batch_size=16,  # batch size per device during training
+        per_device_eval_batch_size=64,   # batch size for evaluation
+        warmup_steps=500,                # number of warmup steps for learning rate scheduler
+        weight_decay=0.01,               # strength of weight decay
+        logging_dir='./logs',            # directory for storing logs
+    )
+
+    trainer = TFTrainer(
+        model=model,                         # the instantiated 🤗 Transformers model to be trained
+        args=training_args,                  # training arguments, defined above
+        train_dataset=tfds_train_dataset,    # tensorflow_datasets training dataset
+        eval_dataset=tfds_test_dataset       # tensorflow_datasets evaluation dataset
+    )
+
+Now simply call ``trainer.train()`` to train and ``trainer.evaluate()`` to
+evaluate. You can use your own module as well, but the first
+element returned from ``forward`` must be the loss which you wish to
+optimize.
+
+:class:`~transformers.Trainer` uses a built-in default function to collate
+batches and prepare them to be fed into the model. If needed, you can also
+use the ``data_collator`` argument to pass your own collator function which
+takes in the data in the format provided by your dataset and returns a
+batch ready to be fed into the model. Note that
+:class:`~transformers.TFTrainer` expects the passed datasets to be dataset
+objects from ``tensorflow_datasets``.
+
+To calculate metrics in addition to the loss, you can also define
+your own ``compute_metrics`` function and pass it to the trainer.
+
+.. code-block:: python
+
+    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+
+    def compute_metrics(pred):
+        labels = pred.label_ids
+        preds = pred.predictions.argmax(-1)
+        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
+        acc = accuracy_score(labels, preds)
+        return {
+            'accuracy': acc,
+            'f1': f1,
+            'precision': precision,
+            'recall': recall
+        }
+
+Finally, you can view the results, including any calculated metrics, by
+launching tensorboard in your specified ``logging_dir`` directory.
+
+
+.. _additional-resources:
+
+Additional resources
+^^^^^^^^^^^^^^^^^^^^
+
+- `A lightweight colab demo <https://colab.research.google.com/drive/1-JIJlao4dI-Ilww_NnTc0rxtp-ymgDgM?usp=sharing>`_
+  which uses ``Trainer`` for IMDb sentiment classification.
+
+- `🤗 Transformers Examples <https://github.com/huggingface/transformers/tree/master/examples>`_
+  including scripts for training and fine-tuning on GLUE, SQuAD, and several other tasks.
+
+- `How to train a language model <https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/01_how_to_train.ipynb>`_,
+  a detailed colab notebook which uses ``Trainer`` to train a masked language model from scratch on Esperanto.
+
+- `🤗 Transformers Notebooks <notebooks.html>`_ which contain dozens of example notebooks from the community for
+  training and using 🤗 Transformers on a variety of tasks.
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -1,829 +0,0 @@
-Usage
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This page shows the most frequent use-cases when using the library. The models available allow for many different
-configurations and a great versatility in use-cases. The most simple ones are presented here, showcasing usage
-for tasks such as question answering, sequence classification, named entity recognition and others.
-
-These examples leverage auto-models, which are classes that will instantiate a model according to a given checkpoint,
-automatically selecting the correct model architecture. Please check the :class:`~transformers.AutoModel` documentation
-for more information.
-Feel free to modify the code to be more specific and adapt it to your specific use-case.
-
-In order for a model to perform well on a task, it must be loaded from a checkpoint corresponding to that task. These
-checkpoints are usually pre-trained on a large corpus of data and fine-tuned on a specific task. This means the
-following:
-
- Not all models were fine-tuned on all tasks. If you want to fine-tune a model on a specific task, you can leverage
-  one of the `run_$TASK.py` script in the
-  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ directory.
- Fine-tuned models were fine-tuned on a specific dataset. This dataset may or may not overlap with your use-case
-  and domain. As mentioned previously, you may leverage the
-  `examples <https://github.com/huggingface/transformers/tree/master/examples>`_ scripts to fine-tune your model, or you
-  may create your own training script.
-
-In order to do an inference on a task, several mechanisms are made available by the library:
-
- Pipelines: very easy-to-use abstractions, which require as little as two lines of code.
- Using a model directly with a tokenizer (PyTorch/TensorFlow): the full inference using the model. Less abstraction,
-  but much more powerful.
-
-Both approaches are showcased here.
-
-.. note::
-
-    All tasks presented here leverage pre-trained checkpoints that were fine-tuned on specific tasks. Loading a
-    checkpoint that was not fine-tuned on a specific task would load only the base transformer layers and not the
-    additional head that is used for the task, initializing the weights of that head randomly.
-
-    This would produce random output.
-
-Sequence Classification
--------------------------
-
-Sequence classification is the task of classifying sequences according to a given number of classes. An example
-of sequence classification is the GLUE dataset, which is entirely based on that task. If you would like to fine-tune
-a model on a GLUE sequence classification task, you may leverage the
-`run_glue.py <https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_glue.py>`_ or
-`run_tf_glue.py <https://github.com/huggingface/transformers/tree/master/examples/text-classification/run_tf_glue.py>`_ scripts.
-
-Here is an example using the pipelines do to sentiment analysis: identifying if a sequence is positive or negative.
-It leverages a fine-tuned model on sst2, which is a GLUE task.
-
-::
-
-    from transformers import pipeline
-
-    nlp = pipeline("sentiment-analysis")
-
-    print(nlp("I hate you"))
-    print(nlp("I love you"))
-
-This returns a label ("POSITIVE" or "NEGATIVE") alongside a score, as follows:
-
-::
-
-    [{'label': 'NEGATIVE', 'score': 0.9991129}]
-    [{'label': 'POSITIVE', 'score': 0.99986565}]
-
-
-Here is an example of doing a sequence classification using a model to determine if two sequences are paraphrases
-of each other. The process is the following:
-
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it
-  with the weights stored in the checkpoint.
- Build a sequence from the two sentences, with the correct model-specific separators token type ids
-  and attention masks (:func:`~transformers.PreTrainedTokenizer.encode` and
-  :func:`~transformers.PreTrainedTokenizer.encode_plus` take care of this)
- Pass this sequence through the model so that it is classified in one of the two available classes: 0
-  (not a paraphrase) and 1 (is a paraphrase)
- Compute the softmax of the result to get probabilities over the classes
- Print the results
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoTokenizer, AutoModelForSequenceClassification
-    import torch
-
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
-    model = AutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
-
-    classes = ["not paraphrase", "is paraphrase"]
-
-    sequence_0 = "The company HuggingFace is based in New York City"
-    sequence_1 = "Apples are especially bad for your health"
-    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
-
-    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="pt")
-    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="pt")
-
-    paraphrase_classification_logits = model(**paraphrase)[0]
-    not_paraphrase_classification_logits = model(**not_paraphrase)[0]
-
-    paraphrase_results = torch.softmax(paraphrase_classification_logits, dim=1).tolist()[0]
-    not_paraphrase_results = torch.softmax(not_paraphrase_classification_logits, dim=1).tolist()[0]
-
-    print("Should be paraphrase")
-    for i in range(len(classes)):
-        print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
-
-    print("\nShould not be paraphrase")
-    for i in range(len(classes)):
-        print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
-    ## TENSORFLOW CODE
-    from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
-    import tensorflow as tf
-
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
-    model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")
-
-    classes = ["not paraphrase", "is paraphrase"]
-
-    sequence_0 = "The company HuggingFace is based in New York City"
-    sequence_1 = "Apples are especially bad for your health"
-    sequence_2 = "HuggingFace's headquarters are situated in Manhattan"
-
-    paraphrase = tokenizer.encode_plus(sequence_0, sequence_2, return_tensors="tf")
-    not_paraphrase = tokenizer.encode_plus(sequence_0, sequence_1, return_tensors="tf")
-
-    paraphrase_classification_logits = model(paraphrase)[0]
-    not_paraphrase_classification_logits = model(not_paraphrase)[0]
-
-    paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]
-    not_paraphrase_results = tf.nn.softmax(not_paraphrase_classification_logits, axis=1).numpy()[0]
-
-    print("Should be paraphrase")
-    for i in range(len(classes)):
-        print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")
-
-    print("\nShould not be paraphrase")
-    for i in range(len(classes)):
-        print(f"{classes[i]}: {round(not_paraphrase_results[i] * 100)}%")
-
-This outputs the following results:
-
-::
-
-    Should be paraphrase
-    not paraphrase: 10%
-    is paraphrase: 90%
-
-    Should not be paraphrase
-    not paraphrase: 94%
-    is paraphrase: 6%
-
-Extractive Question Answering
----------------------------------------------------
-
-Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
-question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
-a model on a SQuAD task, you may leverage the `run_squad.py`.
-
-Here is an example using the pipelines do to question answering: extracting an answer from a text given a question.
-It leverages a fine-tuned model on SQuAD.
-
-::
-
-    from transformers import pipeline
-
-    nlp = pipeline("question-answering")
-
-    context = r"""
-    Extractive Question Answering is the task of extracting an answer from a text given a question. An example of a
-    question answering dataset is the SQuAD dataset, which is entirely based on that task. If you would like to fine-tune
-    a model on a SQuAD task, you may leverage the `run_squad.py`.
-    """
-
-    print(nlp(question="What is extractive question answering?", context=context))
-    print(nlp(question="What is a good example of a question answering dataset?", context=context))
-
-This returns an answer extracted from the text, a confidence score, alongside "start" and "end" values which
-are the positions of the extracted answer in the text.
-
-::
-
-    {'score': 0.622232091629833, 'start': 34, 'end': 96, 'answer': 'the task of extracting an answer from a text given a question.'}
-    {'score': 0.5115299158662765, 'start': 147, 'end': 161, 'answer': 'SQuAD dataset,'}
-
-
-Here is an example of question answering using a model and a tokenizer. The process is the following:
-
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and loads it
-  with the weights stored in the checkpoint.
- Define a text and a few questions.
- Iterate over the questions and build a sequence from the text and the current question, with the correct
-  model-specific separators token type ids and attention masks
- Pass this sequence through the model. This outputs a range of scores across the entire sequence tokens (question and
-  text), for both the start and end positions.
- Compute the softmax of the result to get probabilities over the tokens
- Fetch the tokens from the identified start and stop values, convert those tokens to a string.
- Print the results
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoTokenizer, AutoModelForQuestionAnswering
-    import torch
-
-    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-    model = AutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-
-    text = r"""
-    🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
-    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
-    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
-    TensorFlow 2.0 and PyTorch.
-    """
-
-    questions = [
-        "How many pretrained models are available in Transformers?",
-        "What does Transformers provide?",
-        "Transformers provides interoperability between which frameworks?",
-    ]
-
-    for question in questions:
-        inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="pt")
-        input_ids = inputs["input_ids"].tolist()[0]
-
-        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer_start_scores, answer_end_scores = model(**inputs)
-
-        answer_start = torch.argmax(
-            answer_start_scores
-        )  # Get the most likely beginning of answer with the argmax of the score
-        answer_end = torch.argmax(answer_end_scores) + 1  # Get the most likely end of answer with the argmax of the score
-
-        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
-
-        print(f"Question: {question}")
-        print(f"Answer: {answer}\n")
-    ## TENSORFLOW CODE
-    from transformers import AutoTokenizer, TFAutoModelForQuestionAnswering
-    import tensorflow as tf
-
-    tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-    model = TFAutoModelForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
-
-    text = r"""
-    🤗 Transformers (formerly known as pytorch-transformers and pytorch-pretrained-bert) provides general-purpose
-    architectures (BERT, GPT-2, RoBERTa, XLM, DistilBert, XLNet…) for Natural Language Understanding (NLU) and Natural
-    Language Generation (NLG) with over 32+ pretrained models in 100+ languages and deep interoperability between
-    TensorFlow 2.0 and PyTorch.
-    """
-
-    questions = [
-        "How many pretrained models are available in Transformers?",
-        "What does Transformers provide?",
-        "Transformers provides interoperability between which frameworks?",
-    ]
-
-    for question in questions:
-        inputs = tokenizer.encode_plus(question, text, add_special_tokens=True, return_tensors="tf")
-        input_ids = inputs["input_ids"].numpy()[0]
-
-        text_tokens = tokenizer.convert_ids_to_tokens(input_ids)
-        answer_start_scores, answer_end_scores = model(inputs)
-
-        answer_start = tf.argmax(
-            answer_start_scores, axis=1
-        ).numpy()[0]  # Get the most likely beginning of answer with the argmax of the score
-        answer_end = (
-            tf.argmax(answer_end_scores, axis=1) + 1
-        ).numpy()[0]  # Get the most likely end of answer with the argmax of the score
-        answer = tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(input_ids[answer_start:answer_end]))
-
-        print(f"Question: {question}")
-        print(f"Answer: {answer}\n")
-
-This outputs the questions followed by the predicted answers:
-
-::
-
-    Question: How many pretrained models are available in Transformers?
-    Answer: over 32 +
-
-    Question: What does Transformers provide?
-    Answer: general - purpose architectures
-
-    Question: Transformers provides interoperability between which frameworks?
-    Answer: tensorflow 2 . 0 and pytorch
-
-
-
-Language Modeling
----------------------------------------------------
-
-Language modeling is the task of fitting a model to a corpus, which can be domain specific. All popular transformer
-based models are trained using a variant of language modeling, e.g. BERT with masked language modeling, GPT-2 with
-causal language modeling.
-
-Language modeling can be useful outside of pre-training as well, for example to shift the model distribution to be
-domain-specific: using a language model trained over a very large corpus, and then fine-tuning it to a news dataset
-or on scientific papers e.g. `LysandreJik/arxiv-nlp <https://huggingface.co/lysandre/arxiv-nlp>`__.
-
-Masked Language Modeling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Masked language modeling is the task of masking tokens in a sequence with a masking token, and prompting the model to
-fill that mask with an appropriate token. This allows the model to attend to both the right context (tokens on the
-right of the mask) and the left context (tokens on the left of the mask). Such a training creates a strong basis
-for downstream tasks requiring bi-directional context such as SQuAD (question answering,
-see `Lewis, Lui, Goyal et al. <https://arxiv.org/abs/1910.13461>`__, part 4.2).
-
-Here is an example of using pipelines to replace a mask from a sequence:
-
-::
-
-    from transformers import pipeline
-
-    nlp = pipeline("fill-mask")
-    print(nlp(f"HuggingFace is creating a {nlp.tokenizer.mask_token} that the community uses to solve NLP tasks."))
-
-This outputs the sequences with the mask filled, the confidence score as well as the token id in the tokenizer
-vocabulary:
-
-::
-
-    [
-        {'sequence': '<s> HuggingFace is creating a tool that the community uses to solve NLP tasks.</s>', 'score': 0.15627853572368622, 'token': 3944},
-        {'sequence': '<s> HuggingFace is creating a framework that the community uses to solve NLP tasks.</s>', 'score': 0.11690319329500198, 'token': 7208},
-        {'sequence': '<s> HuggingFace is creating a library that the community uses to solve NLP tasks.</s>', 'score': 0.058063216507434845, 'token': 5560},
-        {'sequence': '<s> HuggingFace is creating a database that the community uses to solve NLP tasks.</s>', 'score': 0.04211743175983429, 'token': 8503},
-        {'sequence': '<s> HuggingFace is creating a prototype that the community uses to solve NLP tasks.</s>', 'score': 0.024718601256608963, 'token': 17715}
-    ]
-
-Here is an example doing masked language modeling using a model and a tokenizer. The process is the following:
-
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a DistilBERT model and
-  loads it with the weights stored in the checkpoint.
- Define a sequence with a masked token, placing the :obj:`tokenizer.mask_token` instead of a word.
- Encode that sequence into IDs and find the position of the masked token in that list of IDs.
- Retrieve the predictions at the index of the mask token: this tensor has the same size as the vocabulary, and the
-  values are the scores attributed to each token. The model gives higher score to tokens he deems probable in that
-  context.
- Retrieve the top 5 tokens using the PyTorch :obj:`topk` or TensorFlow :obj:`top_k` methods.
- Replace the mask token by the tokens and print the results
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoModelWithLMHead, AutoTokenizer
-    import torch
-
-    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    model = AutoModelWithLMHead.from_pretrained("distilbert-base-cased")
-
-    sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
-
-    input = tokenizer.encode(sequence, return_tensors="pt")
-    mask_token_index = torch.where(input == tokenizer.mask_token_id)[1]
-
-    token_logits = model(input)[0]
-    mask_token_logits = token_logits[0, mask_token_index, :]
-
-    top_5_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()
-
-    for token in top_5_tokens:
-        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelWithLMHead, AutoTokenizer
-    import tensorflow as tf
-
-    tokenizer = AutoTokenizer.from_pretrained("distilbert-base-cased")
-    model = TFAutoModelWithLMHead.from_pretrained("distilbert-base-cased")
-
-    sequence = f"Distilled models are smaller than the models they mimic. Using them instead of the large versions would help {tokenizer.mask_token} our carbon footprint."
-
-    input = tokenizer.encode(sequence, return_tensors="tf")
-    mask_token_index = tf.where(input == tokenizer.mask_token_id)[0, 1]
-
-    token_logits = model(input)[0]
-    mask_token_logits = token_logits[0, mask_token_index, :]
-
-    top_5_tokens = tf.math.top_k(mask_token_logits, 5).indices.numpy()
-
-    for token in top_5_tokens:
-        print(sequence.replace(tokenizer.mask_token, tokenizer.decode([token])))
-
-This prints five sequences, with the top 5 tokens predicted by the model:
-
-::
-
-    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help reduce our carbon footprint.
-    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help increase our carbon footprint.
-    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help decrease our carbon footprint.
-    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help offset our carbon footprint.
-    Distilled models are smaller than the models they mimic. Using them instead of the large versions would help improve our carbon footprint.
-
-
-Causal Language Modeling
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Causal language modeling is the task of predicting the token following a sequence of tokens. In this situation, the
-model only attends to the left context (tokens on the left of the mask). Such a training is particularly interesting
-for generation tasks.
-
-Usually, the next token is predicted by sampling from the logits of the last hidden state the model produces from the input sequence.
-
-Here is an example using the tokenizer and model and leveraging the :func:`~transformers.PreTrainedModel.top_k_top_p_filtering` method to sample the next token following an input sequence of tokens.
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoModelWithLMHead, AutoTokenizer, top_k_top_p_filtering
-    import torch
-    from torch.nn import functional as F
-
-
-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    model = AutoModelWithLMHead.from_pretrained("gpt2")
-
-    sequence = f"Hugging Face is based in DUMBO, New York City, and "
-
-    input_ids = tokenizer.encode(sequence, return_tensors="pt")
-
-    # get logits of last hidden state
-    next_token_logits = model(input_ids)[0][:, -1, :]
-
-    # filter
-    filtered_next_token_logits = top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
-
-    # sample
-    probs = F.softmax(filtered_next_token_logits, dim=-1)
-    next_token = torch.multinomial(probs, num_samples=1)
-
-    generated = torch.cat([input_ids, next_token], dim=-1)
-
-    resulting_string = tokenizer.decode(generated.tolist()[0])
-    print(resulting_string)
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelWithLMHead, AutoTokenizer, tf_top_k_top_p_filtering
-    import tensorflow as tf
-
-    tokenizer = AutoTokenizer.from_pretrained("gpt2")
-    model = TFAutoModelWithLMHead.from_pretrained("gpt2")
-
-    sequence = f"Hugging Face is based in DUMBO, New York City, and "
-
-    input_ids = tokenizer.encode(sequence, return_tensors="tf")
-
-    # get logits of last hidden state
-    next_token_logits = model(input_ids)[0][:, -1, :]
-
-    # filter
-    filtered_next_token_logits = tf_top_k_top_p_filtering(next_token_logits, top_k=50, top_p=1.0)
-
-    # sample
-    next_token = tf.random.categorical(filtered_next_token_logits, dtype=tf.int32, num_samples=1)
-
-    generated = tf.concat([input_ids, next_token], axis=1)
-
-    resulting_string = tokenizer.decode(generated.numpy().tolist()[0])
-    print(resulting_string)
-
-
-This outputs a (hopefully) coherent next token following the original sequence, which is in our case is the word *has*:
-
-::
-
-    Hugging Face is based in DUMBO, New York City, and has
-
-In the next section, we show how this functionality is leveraged in :func:`~transformers.PreTrainedModel.generate` to generate multiple tokens up to a user-defined length.
-
-Text Generation
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-In text generation (*a.k.a* *open-ended text generation*) the goal is to create a coherent portion of text that is a continuation from the given context. As an example, is it shown how *GPT-2* can be used in pipelines to generate text. As a default all models apply *Top-K* sampling when used in pipelines as configured in their respective configurations (see `gpt-2 config <https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json>`_ for example).
-
-::
-
-    from transformers import pipeline
-
-    text_generator = pipeline("text-generation")
-    print(text_generator("As far as I am concerned, I will", max_length=50))
-
-
-Here the model generates a random text with a total maximal length of *50* tokens from context *"As far as I am concerned, I will"*.
-The default arguments of ``PreTrainedModel.generate()`` can directly be overriden in the pipeline as is shown above for the argument ``max_length``.
-
-Here is an example for text generation using XLNet and its tokenzier. 
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoModelWithLMHead, AutoTokenizer
-
-    model = AutoModelWithLMHead.from_pretrained("xlnet-base-cased")
-    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
-
-    # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
-    PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
-    (except for Alexei and Maria) are discovered.
-    The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
-    remainder of the story. 1883 Western Siberia,
-    a young Grigori Rasputin is asked by his father and a group of men to perform magic.
-    Rasputin has a vision and denounces one of the men as a horse thief. Although his
-    father initially slaps him for making such an accusation, Rasputin watches as the
-    man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
-    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
-    with people, even a bishop, begging for his blessing. <eod> </s> <eos>""" 
-
-    prompt = "Today the weather is really nice and I am planning on "
-    inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="pt")
-    
-    prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
-    outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
-    generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
-
-    print(generated)
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelWithLMHead, AutoTokenizer
-
-    model = TFAutoModelWithLMHead.from_pretrained("xlnet-base-cased")
-    tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
-
-    # Padding text helps XLNet with short prompts - proposed by Aman Rusia in https://github.com/rusiaaman/XLNet-gen#methodology
-    PADDING_TEXT = """In 1991, the remains of Russian Tsar Nicholas II and his family
-    (except for Alexei and Maria) are discovered.
-    The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
-    remainder of the story. 1883 Western Siberia,
-    a young Grigori Rasputin is asked by his father and a group of men to perform magic.
-    Rasputin has a vision and denounces one of the men as a horse thief. Although his
-    father initially slaps him for making such an accusation, Rasputin watches as the
-    man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
-    the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
-    with people, even a bishop, begging for his blessing. <eod> </s> <eos>""" 
-
-    prompt = "Today the weather is really nice and I am planning on "
-    inputs = tokenizer.encode(PADDING_TEXT + prompt, add_special_tokens=False, return_tensors="tf")
-
-    prompt_length = len(tokenizer.decode(inputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True))
-    outputs = model.generate(inputs, max_length=250, do_sample=True, top_p=0.95, top_k=60)
-    generated = prompt + tokenizer.decode(outputs[0])[prompt_length:]
-
-    print(generated)
-
-Text generation is currently possible with *GPT-2*, *OpenAi-GPT*, *CTRL*, *XLNet*, *Transfo-XL* and *Reformer* in PyTorch and for most models in Tensorflow as well. As can be seen in the example above *XLNet* and *Transfo-xl* often need to be padded to work well.
-GPT-2 is usually a good choice for *open-ended text generation* because it was trained on millions on webpages with a causal language modeling objective.
-
-For more information on how to apply different decoding strategies for text generation, please also refer to our generation blog post `here <https://huggingface.co/blog/how-to-generate>`_.
-
-
-Named Entity Recognition
----------------------------------------------------
-
-Named Entity Recognition (NER) is the task of classifying tokens according to a class, for example identifying a
-token as a person, an organisation or a location.
-An example of a named entity recognition dataset is the CoNLL-2003 dataset, which is entirely based on that task.
-If you would like to fine-tune a model on an NER task, you may leverage the `ner/run_ner.py` (PyTorch),
-`ner/run_pl_ner.py` (leveraging pytorch-lightning) or the `ner/run_tf_ner.py` (TensorFlow) scripts.
-
-Here is an example using the pipelines do to named entity recognition, trying to identify tokens as belonging to one
-of 9 classes:
-
- O, Outside of a named entity
- B-MIS, Beginning of a miscellaneous entity right after another miscellaneous entity
- I-MIS, Miscellaneous entity
- B-PER, Beginning of a person's name right after another person's name
- I-PER, Person's name
- B-ORG, Beginning of an organisation right after another organisation
- I-ORG, Organisation
- B-LOC, Beginning of a location right after another location
- I-LOC, Location
-
-It leverages a fine-tuned model on CoNLL-2003, fine-tuned by `@stefan-it <https://github.com/stefan-it>`__ from
-`dbmdz <https://github.com/dbmdz>`__.
-
-::
-
-    from transformers import pipeline
-
-    nlp = pipeline("ner")
-
-    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-               "close to the Manhattan Bridge which is visible from the window."
-
-    print(nlp(sequence))
-
-This outputs a list of all words that have been identified as an entity from the 9 classes defined above. Here is the
-expected results:
-
-::
-
-    [
-        {'word': 'Hu', 'score': 0.9995632767677307, 'entity': 'I-ORG'},
-        {'word': '##gging', 'score': 0.9915938973426819, 'entity': 'I-ORG'},
-        {'word': 'Face', 'score': 0.9982671737670898, 'entity': 'I-ORG'},
-        {'word': 'Inc', 'score': 0.9994403719902039, 'entity': 'I-ORG'},
-        {'word': 'New', 'score': 0.9994346499443054, 'entity': 'I-LOC'},
-        {'word': 'York', 'score': 0.9993270635604858, 'entity': 'I-LOC'},
-        {'word': 'City', 'score': 0.9993864893913269, 'entity': 'I-LOC'},
-        {'word': 'D', 'score': 0.9825621843338013, 'entity': 'I-LOC'},
-        {'word': '##UM', 'score': 0.936983048915863, 'entity': 'I-LOC'},
-        {'word': '##BO', 'score': 0.8987102508544922, 'entity': 'I-LOC'},
-        {'word': 'Manhattan', 'score': 0.9758241176605225, 'entity': 'I-LOC'},
-        {'word': 'Bridge', 'score': 0.990249514579773, 'entity': 'I-LOC'}
-    ]
-
-Note how the words "Hugging Face" have been identified as an organisation, and "New York City", "DUMBO" and
-"Manhattan Bridge" have been identified as locations.
-
-Here is an example doing named entity recognition using a model and a tokenizer. The process is the following:
-
- Instantiate a tokenizer and a model from the checkpoint name. The model is identified as a BERT model and
-  loads it with the weights stored in the checkpoint.
- Define the label list with which the model was trained on.
- Define a sequence with known entities, such as "Hugging Face" as an organisation and "New York City" as a location.
- Split words into tokens so that they can be mapped to the predictions. We use a small hack by firstly completely
-  encoding and decoding the sequence, so that we're left with a string that contains the special tokens.
- Encode that sequence into IDs (special tokens are added automatically).
- Retrieve the predictions by passing the input to the model and getting the first output. This results in a
-  distribution over the 9 possible classes for each token. We take the argmax to retrieve the most likely class
-  for each token.
- Zip together each token with its prediction and print it.
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoModelForTokenClassification, AutoTokenizer
-    import torch
-
-    model = AutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-
-    label_list = [
-        "O",       # Outside of a named entity
-        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
-        "I-MISC",  # Miscellaneous entity
-        "B-PER",   # Beginning of a person's name right after another person's name
-        "I-PER",   # Person's name
-        "B-ORG",   # Beginning of an organisation right after another organisation
-        "I-ORG",   # Organisation
-        "B-LOC",   # Beginning of a location right after another location
-        "I-LOC"    # Location
-    ]
-
-    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-               "close to the Manhattan Bridge."
-
-    # Bit of a hack to get the tokens with the special tokens
-    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
-    inputs = tokenizer.encode(sequence, return_tensors="pt")
-
-    outputs = model(inputs)[0]
-    predictions = torch.argmax(outputs, dim=2)
-
-    print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].tolist())])
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelForTokenClassification, AutoTokenizer
-    import tensorflow as tf
-
-    model = TFAutoModelForTokenClassification.from_pretrained("dbmdz/bert-large-cased-finetuned-conll03-english")
-    tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
-
-    label_list = [
-        "O",       # Outside of a named entity
-        "B-MISC",  # Beginning of a miscellaneous entity right after another miscellaneous entity
-        "I-MISC",  # Miscellaneous entity
-        "B-PER",   # Beginning of a person's name right after another person's name
-        "I-PER",   # Person's name
-        "B-ORG",   # Beginning of an organisation right after another organisation
-        "I-ORG",   # Organisation
-        "B-LOC",   # Beginning of a location right after another location
-        "I-LOC"    # Location
-    ]
-
-    sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
-               "close to the Manhattan Bridge."
-
-    # Bit of a hack to get the tokens with the special tokens
-    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(sequence)))
-    inputs = tokenizer.encode(sequence, return_tensors="tf")
-
-    outputs = model(inputs)[0]
-    predictions = tf.argmax(outputs, axis=2)
-
-    print([(token, label_list[prediction]) for token, prediction in zip(tokens, predictions[0].numpy())])
-
-This outputs a list of each token mapped to their prediction. Differently from the pipeline, here every token has
-a prediction as we didn't remove the "0" class which means that no particular entity was found on that token. The
-following array should be the output:
-
-::
-
-    [('[CLS]', 'O'), ('Hu', 'I-ORG'), ('##gging', 'I-ORG'), ('Face', 'I-ORG'), ('Inc', 'I-ORG'), ('.', 'O'), ('is', 'O'), ('a', 'O'), ('company', 'O'), ('based', 'O'), ('in', 'O'), ('New', 'I-LOC'), ('York', 'I-LOC'), ('City', 'I-LOC'), ('.', 'O'), ('Its', 'O'), ('headquarters', 'O'), ('are', 'O'), ('in', 'O'), ('D', 'I-LOC'), ('##UM', 'I-LOC'), ('##BO', 'I-LOC'), (',', 'O'), ('therefore', 'O'), ('very', 'O'), ('##c', 'O'), ('##lose', 'O'), ('to', 'O'), ('the', 'O'), ('Manhattan', 'I-LOC'), ('Bridge', 'I-LOC'), ('.', 'O'), ('[SEP]', 'O')]   
-Summarization
----------------------------------------------------
-
-Summarization is the task of summarizing a text / an article into a shorter text.
-
-An example of a summarization dataset is the CNN / Daily Mail dataset, which consists of long news articles and was created for the task of summarization.
-If you would like to fine-tune a model on a summarization task, you may leverage the ``examples/summarization/bart/run_train.sh`` (leveraging pytorch-lightning) script.
-
-Here is an example using the pipelines do to summarization. 
-It leverages a Bart model that was fine-tuned on the CNN / Daily Mail data set.
-
-::
-
-    from transformers import pipeline
-
-    summarizer = pipeline("summarization")
-
-    ARTICLE = """ New York (CNN)When Liana Barrientos was 23 years old, she got married in Westchester County, New York. 
-    A year later, she got married again in Westchester County, but to a different man and without divorcing her first husband. 
-    Only 18 days after that marriage, she got hitched yet again. Then, Barrientos declared "I do" five more times, sometimes only within two weeks of each other. 
-    In 2010, she married once more, this time in the Bronx. In an application for a marriage license, she stated it was her "first and only" marriage. 
-    Barrientos, now 39, is facing two criminal counts of "offering a false instrument for filing in the first degree," referring to her false statements on the 
-    2010 marriage license application, according to court documents. 
-    Prosecutors said the marriages were part of an immigration scam. 
-    On Friday, she pleaded not guilty at State Supreme Court in the Bronx, according to her attorney, Christopher Wright, who declined to comment further. 
-    After leaving court, Barrientos was arrested and charged with theft of service and criminal trespass for allegedly sneaking into the New York subway through an emergency exit, said Detective 
-    Annette Markowski, a police spokeswoman. In total, Barrientos has been married 10 times, with nine of her marriages occurring between 1999 and 2002. 
-    All occurred either in Westchester County, Long Island, New Jersey or the Bronx. She is believed to still be married to four men, and at one time, she was married to eight men at once, prosecutors say. 
-    Prosecutors said the immigration scam involved some of her husbands, who filed for permanent residence status shortly after the marriages. 
-    Any divorces happened only after such filings were approved. It was unclear whether any of the men will be prosecuted. 
-    The case was referred to the Bronx District Attorney\'s Office by Immigration and Customs Enforcement and the Department of Homeland Security\'s 
-    Investigation Division. Seven of the men are from so-called "red-flagged" countries, including Egypt, Turkey, Georgia, Pakistan and Mali. 
-    Her eighth husband, Rashid Rajput, was deported in 2006 to his native Pakistan after an investigation by the Joint Terrorism Task Force. 
-    If convicted, Barrientos faces up to four years in prison.  Her next court appearance is scheduled for May 18.
-    """
-    
-    print(summarizer(ARTICLE, max_length=130, min_length=30))
-
-Because the summarization pipeline depends on the ``PretrainedModel.generate()`` method, we can override the default arguments 
-of ``PretrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` and ``min_length`` above.
-This outputs the following summary:
-
-::
-
-  Liana Barrientos has been married 10 times, sometimes within two weeks of each other. Prosecutors say the marriages were part of an immigration scam. She pleaded not guilty at State Supreme Court in the Bronx on Friday.
-  
-Here is an example doing summarization using a model and a tokenizer. The process is the following:
-
- Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``.
- Define the article that should be summarizaed.
- Leverage the ``PretrainedModel.generate()`` method.
- Add the T5 specific prefix "summarize: ".
-
-Here Google`s T5 model is used that was only pre-trained on a multi-task mixed data set (including CNN / Daily Mail), but nevertheless yields very good results.
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoModelWithLMHead, AutoTokenizer
-
-    model = AutoModelWithLMHead.from_pretrained("t5-base")
-    tokenizer = AutoTokenizer.from_pretrained("t5-base")
-
-    # T5 uses a max_length of 512 so we cut the article to 512 tokens.
-    inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="pt", max_length=512)
-    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
-    print(outputs)
-    
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelWithLMHead, AutoTokenizer
-
-    model = TFAutoModelWithLMHead.from_pretrained("t5-base")
-    tokenizer = AutoTokenizer.from_pretrained("t5-base")
-
-    # T5 uses a max_length of 512 so we cut the article to 512 tokens.
-    inputs = tokenizer.encode("summarize: " + ARTICLE, return_tensors="tf", max_length=512)
-    outputs = model.generate(inputs, max_length=150, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
-    print(outputs)  
-Translation
----------------------------------------------------
-
-Translation is the task of translating a text from one language to another.
-
-An example of a translation dataset is the WMT English to German dataset, which has English sentences as the input data 
-and German sentences as the target data.
-
-Here is an example using the pipelines do to translation. 
-It leverages a T5 model that was only pre-trained on a multi-task mixture dataset (including WMT), but yields impressive 
-translation results nevertheless.
-
-::
-
-    from transformers import pipeline
-
-    translator = pipeline("translation_en_to_de")
-    print(translator("Hugging Face is a technology company based in New York and Paris", max_length=40))
-
-Because the translation pipeline depends on the ``PretrainedModel.generate()`` method, we can override the default arguments 
-of ``PretrainedModel.generate()`` directly in the pipeline as is shown for ``max_length`` above.
-This outputs the following translation into German:
-
-::
-
-  Hugging Face ist ein Technologieunternehmen mit Sitz in New York und Paris.
-  
-Here is an example doing translation using a model and a tokenizer. The process is the following:
-
- Instantiate a tokenizer and a model from the checkpoint name. Summarization is usually done using an encoder-decoder model, such as ``Bart`` or ``T5``.
- Define the article that should be summarizaed.
- Leverage the ``PretrainedModel.generate()`` method.
- Add the T5 specific prefix "translate English to German: "
-
-::
-
-    ## PYTORCH CODE
-    from transformers import AutoModelWithLMHead, AutoTokenizer
-
-    model = AutoModelWithLMHead.from_pretrained("t5-base")
-    tokenizer = AutoTokenizer.from_pretrained("t5-base")
-
-    inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="pt")
-    outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
-
-    print(outputs)
-    
-    ## TENSORFLOW CODE
-    from transformers import TFAutoModelWithLMHead, AutoTokenizer
-
-    model = TFAutoModelWithLMHead.from_pretrained("t5-base")
-    tokenizer = AutoTokenizer.from_pretrained("t5-base")
-
-    inputs = tokenizer.encode("translate English to German: Hugging Face is a technology company based in New York and Paris", return_tensors="tf")
-    outputs = model.generate(inputs, max_length=40, num_beams=4, early_stopping=True)
-
-    print(outputs)
--- a/examples/README.md
+++ b/examples/README.md
@@ -1,7 +1,7 @@
-## Examples
+# Examples

-Version 2.9 of `transformers` introduces a new [`Trainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) class for PyTorch, and its equivalent [`TFTrainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_tf.py) for TF 2.
-Running the examples requires PyTorch 1.3.1+ or TensorFlow 2.0+.
+Version 2.9 of 🤗 Transformers introduces a new [`Trainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer.py) class for PyTorch, and its equivalent [`TFTrainer`](https://github.com/huggingface/transformers/blob/master/src/transformers/trainer_tf.py) for TF 2.
+Running the examples requires PyTorch 1.3.1+ or TensorFlow 2.2+.

 Here is the list of all our examples:
 - **grouped by task** (all official examples work for multiple models)
@@ -13,7 +13,7 @@ Here is the list of all our examples:
 This is still a work-in-progress – in particular documentation is still sparse – so please **contribute improvements/pull requests.**


-# The Big Table of Tasks
+## The Big Table of Tasks

 | Task | Example datasets | Trainer support | TFTrainer support | pytorch-lightning | Colab
 |---|---|:---:|:---:|:---:|:---:|
@@ -21,13 +21,13 @@ This is still a work-in-progress – in particular documentation is still sparse
 | [**`text-classification`**](https://github.com/huggingface/transformers/tree/master/examples/text-classification)   | GLUE, XNLI      | ✅ | ✅ | ✅ | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/trainer/01_text_classification.ipynb)
 | [**`token-classification`**](https://github.com/huggingface/transformers/tree/master/examples/token-classification) | CoNLL NER       | ✅ | ✅ | ✅ | -
 | [**`multiple-choice`**](https://github.com/huggingface/transformers/tree/master/examples/multiple-choice)           | SWAG, RACE, ARC | ✅ | ✅ | -  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/ViktorAlm/notebooks/blob/master/MPC_GPU_Demo_for_TF_and_PT.ipynb)
-| [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering)     | SQuAD           | -  | ✅ | -  | -
-| [**`text-generation`**](https://github.com/huggingface/transformers/tree/master/examples/text-generation)     | -           | -  | - | -  | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)
+| [**`question-answering`**](https://github.com/huggingface/transformers/tree/master/examples/question-answering)     | SQuAD           | ✅ | ✅ | -  | -
+| [**`text-generation`**](https://github.com/huggingface/transformers/tree/master/examples/text-generation)           | -               | n/a | n/a | n/a | [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/huggingface/blog/blob/master/notebooks/02_how_to_generate.ipynb)
 | [**`distillation`**](https://github.com/huggingface/transformers/tree/master/examples/distillation)       | All               | -  | -  | -  | -
-| [**`summarization`**](https://github.com/huggingface/transformers/tree/master/examples/summarization)     | CNN/Daily Mail    | -  | -  | -  | -
-| [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/translation)         | WMT               | -  | -  | -  | -
+| [**`summarization`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)     | CNN/Daily Mail    | -  | -  | ✅  | -
+| [**`translation`**](https://github.com/huggingface/transformers/tree/master/examples/seq2seq)         | WMT               | -  | -  | ✅  | -
 | [**`bertology`**](https://github.com/huggingface/transformers/tree/master/examples/bertology)             | -                 | -  | -  | -  | -
-| [**`adversarial`**](https://github.com/huggingface/transformers/tree/master/examples/adversarial)         | HANS              | -  | -  | -  | -
+| [**`adversarial`**](https://github.com/huggingface/transformers/tree/master/examples/adversarial)         | HANS              | ✅ | -  | -  | -


 <br>
@@ -78,3 +78,50 @@ python examples/xla_spawn.py --num_cores 8 \
 ```

 Feedback and more use cases and benchmarks involving TPUs are welcome, please share with the community.
+
+## Logging & Experiment tracking
+
+You can easily log and monitor your runs. The following are currently supported:
+
+* [TensorBoard](https://www.tensorflow.org/tensorboard)
+* [Weights & Biases](https://docs.wandb.com/library/integrations/huggingface)
+* [Comet ML](https://www.comet.ml/docs/python-sdk/huggingface/)
+
+### Weights & Biases
+
+To use Weights & Biases, install the wandb package with:
+
+```bash
+pip install wandb
+```
+
+Then log in from the command line:
+
+```bash
+wandb login
+```
+
+If you are in Jupyter or Colab, you should log in with:
+
+```python
+import wandb
+wandb.login()
+```
+
+Whenever you use `Trainer` or `TFTrainer` classes, your losses, evaluation metrics, model topology and gradients (for `Trainer` only) will automatically be logged.
+
+When using 🤗 Transformers with PyTorch Lightning, runs can be tracked through `WandbLogger`. Refer to the related [documentation & examples](https://docs.wandb.com/library/integrations/lightning).
+
+### Comet.ml
+
+To use `comet_ml`, install the Python package with:
+
+```bash
+pip install comet_ml
+```
+
+or if in a Conda environment:
+
+```bash
+conda install -c comet_ml -c anaconda -c conda-forge comet_ml
+```
--- a/examples/adversarial/README.md
+++ b/examples/adversarial/README.md
@@ -11,7 +11,7 @@ export HANS_DIR=path-to-hans
 export MODEL_TYPE=type-of-the-model-e.g.-bert-roberta-xlnet-etc
 export MODEL_PATH=path-to-the-model-directory-that-is-trained-on-NLI-e.g.-by-using-run_glue.py

-python examples/hans/test_hans.py \
+python run_hans.py \
        --task_name hans \
        --model_type $MODEL_TYPE \
        --do_eval \
--- a/examples/adversarial/hans_processors.py
+++ b/examples/adversarial/hans_processors.py
@@ -1,221 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" GLUE processors and helpers """
-
-import logging
-import os
-
-from transformers.file_utils import is_tf_available
-from utils_hans import DataProcessor, InputExample, InputFeatures
-
-
-if is_tf_available():
-    import tensorflow as tf
-
-logger = logging.getLogger(__name__)
-
-
-def hans_convert_examples_to_features(
-    examples,
-    tokenizer,
-    max_length=512,
-    task=None,
-    label_list=None,
-    output_mode=None,
-    pad_on_left=False,
-    pad_token=0,
-    pad_token_segment_id=0,
-    mask_padding_with_zero=True,
-):
-    """
-    Loads a data file into a list of ``InputFeatures``
-
-    Args:
-        examples: List of ``InputExamples`` or ``tf.data.Dataset`` containing the examples.
-        tokenizer: Instance of a tokenizer that will tokenize the examples
-        max_length: Maximum example length
-        task: HANS
-        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method
-        output_mode: String indicating the output mode. Either ``regression`` or ``classification``
-        pad_on_left: If set to ``True``, the examples will be padded on the left rather than on the right (default)
-        pad_token: Padding token
-        pad_token_segment_id: The segment ID for the padding token (It is usually 0, but can vary such as for XLNet where it is 4)
-        mask_padding_with_zero: If set to ``True``, the attention mask will be filled by ``1`` for actual values
-            and by ``0`` for padded values. If set to ``False``, inverts it (``1`` for padded values, ``0`` for
-            actual values)
-
-    Returns:
-        If the ``examples`` input is a ``tf.data.Dataset``, will return a ``tf.data.Dataset``
-        containing the task-specific features. If the input is a list of ``InputExamples``, will return
-        a list of task-specific ``InputFeatures`` which can be fed to the model.
-
-    """
-    is_tf_dataset = False
-    if is_tf_available() and isinstance(examples, tf.data.Dataset):
-        is_tf_dataset = True
-
-    if task is not None:
-        processor = glue_processors[task]()
-        if label_list is None:
-            label_list = processor.get_labels()
-            logger.info("Using label list %s for task %s" % (label_list, task))
-        if output_mode is None:
-            output_mode = glue_output_modes[task]
-            logger.info("Using output mode %s for task %s" % (output_mode, task))
-
-    label_map = {label: i for i, label in enumerate(label_list)}
-
-    features = []
-    for (ex_index, example) in enumerate(examples):
-        if ex_index % 10000 == 0:
-            logger.info("Writing example %d" % (ex_index))
-        if is_tf_dataset:
-            example = processor.get_example_from_tensor_dict(example)
-            example = processor.tfds_map(example)
-
-        inputs = tokenizer.encode_plus(example.text_a, example.text_b, add_special_tokens=True, max_length=max_length,)
-        input_ids, token_type_ids = inputs["input_ids"], inputs["token_type_ids"]
-
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        # tokens are attended to.
-        attention_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
-
-        # Zero-pad up to the sequence length.
-        padding_length = max_length - len(input_ids)
-        if pad_on_left:
-            input_ids = ([pad_token] * padding_length) + input_ids
-            attention_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + attention_mask
-            token_type_ids = ([pad_token_segment_id] * padding_length) + token_type_ids
-        else:
-            input_ids = input_ids + ([pad_token] * padding_length)
-            attention_mask = attention_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
-            token_type_ids = token_type_ids + ([pad_token_segment_id] * padding_length)
-
-        assert len(input_ids) == max_length, "Error with input length {} vs {}".format(len(input_ids), max_length)
-        assert len(attention_mask) == max_length, "Error with input length {} vs {}".format(
-            len(attention_mask), max_length
-        )
-        assert len(token_type_ids) == max_length, "Error with input length {} vs {}".format(
-            len(token_type_ids), max_length
-        )
-
-        if output_mode == "classification":
-            label = label_map[example.label] if example.label in label_map else 0
-        elif output_mode == "regression":
-            label = float(example.label)
-        else:
-            raise KeyError(output_mode)
-        pairID = str(example.pairID)
-
-        if ex_index < 10:
-            logger.info("*** Example ***")
-            logger.info("text_a: %s" % (example.text_a))
-            logger.info("text_b: %s" % (example.text_b))
-            logger.info("guid: %s" % (example.guid))
-            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            logger.info("attention_mask: %s" % " ".join([str(x) for x in attention_mask]))
-            logger.info("token_type_ids: %s" % " ".join([str(x) for x in token_type_ids]))
-            logger.info("label: %s (id = %d)" % (example.label, label))
-
-        features.append(
-            InputFeatures(
-                input_ids=input_ids,
-                attention_mask=attention_mask,
-                token_type_ids=token_type_ids,
-                label=label,
-                pairID=pairID,
-            )
-        )
-
-    if is_tf_available() and is_tf_dataset:
-
-        def gen():
-            for ex in features:
-                yield (
-                    {
-                        "input_ids": ex.input_ids,
-                        "attention_mask": ex.attention_mask,
-                        "token_type_ids": ex.token_type_ids,
-                    },
-                    ex.label,
-                )
-
-        return tf.data.Dataset.from_generator(
-            gen,
-            ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
-            (
-                {
-                    "input_ids": tf.TensorShape([None]),
-                    "attention_mask": tf.TensorShape([None]),
-                    "token_type_ids": tf.TensorShape([None]),
-                },
-                tf.TensorShape([]),
-            ),
-        )
-
-    return features
-
-
-class HansProcessor(DataProcessor):
-    """Processor for the HANS data set."""
-
-    def get_example_from_tensor_dict(self, tensor_dict):
-        """See base class."""
-        return InputExample(
-            tensor_dict["idx"].numpy(),
-            tensor_dict["premise"].numpy().decode("utf-8"),
-            tensor_dict["hypothesis"].numpy().decode("utf-8"),
-            str(tensor_dict["label"].numpy()),
-        )
-
-    def get_train_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train")
-
-    def get_dev_examples(self, data_dir):
-        """See base class."""
-        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev")
-
-    def get_labels(self):
-        """See base class."""
-        return ["contradiction", "entailment", "neutral"]
-
-    def _create_examples(self, lines, set_type):
-        """Creates examples for the training and dev sets."""
-        examples = []
-        for (i, line) in enumerate(lines):
-            if i == 0:
-                continue
-            guid = "%s-%s" % (set_type, line[0])
-            text_a = line[5]
-            text_b = line[6]
-            pairID = line[7][2:] if line[7].startswith("ex") else line[7]
-            label = line[-1]
-            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID))
-        return examples
-
-
-glue_tasks_num_labels = {
-    "hans": 3,
-}
-
-glue_processors = {
-    "hans": HansProcessor,
-}
-
-glue_output_modes = {
-    "hans": "classification",
-}
--- a/examples/adversarial/run_hans.py
+++ b/examples/adversarial/run_hans.py
@@ -0,0 +1,231 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for sequence classification on HANS."""
+
+import logging
+import os
+from dataclasses import dataclass, field
+from typing import Dict, List, Optional
+
+import numpy as np
+import torch
+
+from transformers import (
+    AutoConfig,
+    AutoModelForSequenceClassification,
+    AutoTokenizer,
+    HfArgumentParser,
+    Trainer,
+    TrainingArguments,
+    default_data_collator,
+    set_seed,
+)
+from utils_hans import HansDataset, InputFeatures, hans_processors, hans_tasks_num_labels
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class ModelArguments:
+    """
+    Which pretrained model, config and tokenizer to fine-tune from; parsed from the command line in ``main``.
+    """
+
+    model_name_or_path: str = field(
+        metadata={"help": "Path to pretrained model or model identifier from huggingface.co/models"}
+    )
+    config_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
+    )
+    tokenizer_name: Optional[str] = field(
+        default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
+    )
+    cache_dir: Optional[str] = field(
+        default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
+    )
+
+
+@dataclass
+class DataTrainingArguments:
+    """
+    Task name, data directory, maximum sequence length and cache option used to build the train and eval datasets.
+    """
+
+    task_name: str = field(
+        metadata={"help": "The name of the task to train selected in the list: " + ", ".join(hans_processors.keys())}
+    )
+    data_dir: str = field(
+        metadata={"help": "The input data dir. Should contain the .tsv files (or other data files) for the task."}
+    )
+    max_seq_length: int = field(
+        default=128,
+        metadata={
+            "help": "The maximum total input sequence length after tokenization. Sequences longer "
+            "than this will be truncated, sequences shorter will be padded."
+        },
+    )
+    overwrite_cache: bool = field(
+        default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
+    )
+
+
+def hans_data_collator(features: List[InputFeatures]) -> Dict[str, torch.Tensor]:
+    """
+    Collates ``features`` with ``default_data_collator`` and drops the "pairID" entry, if any.
+    """
+    collated = default_data_collator(features)
+    collated.pop("pairID", None)
+    return collated
+
+
+def main():
+    """Fine-tune and/or evaluate a sequence classifier on HANS, driven entirely by command-line arguments.
+    See src/transformers/training_args.py or pass the --help flag for all possible arguments; model, data
+    and training arguments are kept as distinct dataclasses for a cleaner separation of concerns."""
+
+    parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments))
+    model_args, data_args, training_args = parser.parse_args_into_dataclasses()
+
+    if (
+        os.path.exists(training_args.output_dir)
+        and os.listdir(training_args.output_dir)
+        and training_args.do_train
+        and not training_args.overwrite_output_dir
+    ):
+        raise ValueError(
+            f"Output directory ({training_args.output_dir}) already exists and is not empty. Use --overwrite_output_dir to overcome."
+        )
+
+    # Setup logging: INFO when local_rank is -1 or 0, WARN on every other rank
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        training_args.local_rank,
+        training_args.device,
+        training_args.n_gpu,
+        bool(training_args.local_rank != -1),
+        training_args.fp16,
+    )
+    logger.info("Training/evaluation parameters %s", training_args)
+
+    # Set seed
+    set_seed(training_args.seed)
+
+    try:
+        num_labels = hans_tasks_num_labels[data_args.task_name]
+    except KeyError:
+        raise ValueError("Task not found: %s" % (data_args.task_name))
+
+    # Load pretrained model and tokenizer
+    #
+    # Distributed training:
+    # The .from_pretrained methods guarantee that only one local process can concurrently
+    # download model & vocab.
+
+    config = AutoConfig.from_pretrained(
+        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
+        num_labels=num_labels,
+        finetuning_task=data_args.task_name,
+        cache_dir=model_args.cache_dir,
+    )
+    tokenizer = AutoTokenizer.from_pretrained(
+        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
+        cache_dir=model_args.cache_dir,
+    )
+    model = AutoModelForSequenceClassification.from_pretrained(
+        model_args.model_name_or_path,
+        from_tf=bool(".ckpt" in model_args.model_name_or_path),
+        config=config,
+        cache_dir=model_args.cache_dir,
+    )
+
+    # Get datasets: each one is only built when its flag (--do_train / --do_eval) is set, otherwise None
+    train_dataset = (
+        HansDataset(
+            data_dir=data_args.data_dir,
+            tokenizer=tokenizer,
+            task=data_args.task_name,
+            max_seq_length=data_args.max_seq_length,
+            overwrite_cache=data_args.overwrite_cache,
+        )
+        if training_args.do_train
+        else None
+    )
+    eval_dataset = (
+        HansDataset(
+            data_dir=data_args.data_dir,
+            tokenizer=tokenizer,
+            task=data_args.task_name,
+            max_seq_length=data_args.max_seq_length,
+            overwrite_cache=data_args.overwrite_cache,
+            evaluate=True,
+        )
+        if training_args.do_eval
+        else None
+    )
+
+    # Initialize our Trainer; hans_data_collator removes "pairID" from every collated batch
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=train_dataset,
+        eval_dataset=eval_dataset,
+        data_collator=hans_data_collator,
+    )
+
+    # Training (model_path is only passed when model_name_or_path is a local directory)
+    if training_args.do_train:
+        trainer.train(
+            model_path=model_args.model_name_or_path if os.path.isdir(model_args.model_name_or_path) else None
+        )
+        trainer.save_model()
+        # For convenience, we also re-save the tokenizer to the same directory,
+        # so that you can share your model easily on huggingface.co/models =)
+        if trainer.is_world_master():
+            tokenizer.save_pretrained(training_args.output_dir)
+
+    # Evaluation: writes one "ex<pairID>,<predicted label>" row per example to hans_predictions.txt
+    if training_args.do_eval:
+        logger.info("*** Evaluate ***")
+
+        output = trainer.predict(eval_dataset)
+        preds = output.predictions
+        preds = np.argmax(preds, axis=1)
+
+        pair_ids = [ex.pairID for ex in eval_dataset]
+        output_eval_file = os.path.join(training_args.output_dir, "hans_predictions.txt")
+        label_list = eval_dataset.get_labels()
+        if trainer.is_world_master():
+            with open(output_eval_file, "w") as writer:
+                writer.write("pairID,gold_label\n")
+                for pid, pred in zip(pair_ids, preds):
+                    writer.write("ex" + str(pid) + "," + label_list[int(pred)] + "\n")
+
+        trainer._log(output.metrics)
+
+
+def _mp_fn(index):
+    # Entry point for xla_spawn (TPUs); `index` is accepted but not used, it simply runs main().
+    main()
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/adversarial/utils_hans.py
+++ b/examples/adversarial/utils_hans.py
@@ -14,108 +14,328 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import copy
-import csv
-import json
+import logging
+import os
+from dataclasses import dataclass
+from typing import List, Optional, Union
+
+import tqdm
+
+from filelock import FileLock
+from transformers import (
+    BartTokenizer,
+    BartTokenizerFast,
+    DataProcessor,
+    PreTrainedTokenizer,
+    RobertaTokenizer,
+    RobertaTokenizerFast,
+    XLMRobertaTokenizer,
+    is_tf_available,
+    is_torch_available,
+)


-class InputExample(object):
+logger = logging.getLogger(__name__)
+
+
+@dataclass(frozen=True)
+class InputExample:
    """
    A single training/test example for simple sequence classification.

    Args:
        guid: Unique id for the example.
        text_a: string. The untokenized text of the first sequence. For single
-        sequence tasks, only this sequence must be specified.
+            sequence tasks, only this sequence must be specified.
        text_b: (Optional) string. The untokenized text of the second sequence.
-        Only must be specified for sequence pair tasks.
+            Only must be specified for sequence pair tasks.
        label: (Optional) string. The label of the example. This should be
-        specified for train and dev examples, but not for test examples.
+            specified for train and dev examples, but not for test examples.
+        pairID: (Optional) string. Unique identifier for the pair of sentences.
    """

-    def __init__(self, guid, text_a, text_b=None, label=None, pairID=None):
-        self.guid = guid
-        self.text_a = text_a
-        self.text_b = text_b
-        self.label = label
-        self.pairID = pairID
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+    guid: str
+    text_a: str
+    text_b: Optional[str] = None
+    label: Optional[str] = None
+    pairID: Optional[str] = None


-class InputFeatures(object):
+@dataclass(frozen=True)
+class InputFeatures:
    """
    A single set of features of data.
+    Property names are the same names as the corresponding inputs to a model.

    Args:
        input_ids: Indices of input sequence tokens in the vocabulary.
        attention_mask: Mask to avoid performing attention on padding token indices.
            Mask values selected in ``[0, 1]``:
            Usually  ``1`` for tokens that are NOT MASKED, ``0`` for MASKED (padded) tokens.
-        token_type_ids: Segment token indices to indicate first and second portions of the inputs.
-        label: Label corresponding to the input
+        token_type_ids: (Optional) Segment token indices to indicate first and second
+            portions of the inputs. Only some models use them.
+        label: (Optional) Label corresponding to the input. Int for classification problems,
+            float for regression problems.
+        pairID: (Optional) Unique identifier for the pair of sentences.
    """

-    def __init__(self, input_ids, attention_mask, token_type_ids, label, pairID=None):
-        self.input_ids = input_ids
-        self.attention_mask = attention_mask
-        self.token_type_ids = token_type_ids
-        self.label = label
-        self.pairID = pairID
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+    input_ids: List[int]
+    attention_mask: Optional[List[int]] = None
+    token_type_ids: Optional[List[int]] = None
+    label: Optional[Union[int, float]] = None
+    pairID: Optional[int] = None


-class DataProcessor(object):
-    """Base class for data converters for sequence classification data sets."""
+if is_torch_available():
+    import torch
+    from torch.utils.data.dataset import Dataset

-    def get_example_from_tensor_dict(self, tensor_dict):
-        """Gets an example from a dict with tensorflow tensors
-
-        Args:
-            tensor_dict: Keys and values should match the corresponding Glue
-                tensorflow_dataset examples.
+    class HansDataset(Dataset):
        """
-        raise NotImplementedError()
+        This will be superseded by a framework-agnostic approach
+        soon.
+        """
+
+        features: List[InputFeatures]
+
+        def __init__(
+            self,
+            data_dir: str,
+            tokenizer: PreTrainedTokenizer,
+            task: str,
+            max_seq_length: Optional[int] = None,
+            overwrite_cache=False,
+            evaluate: bool = False,
+        ):
+            processor = hans_processors[task]()  # KeyError if `task` is not a key of hans_processors
+
+            cached_features_file = os.path.join(  # name encodes split, tokenizer class, max length and task
+                data_dir,
+                "cached_{}_{}_{}_{}".format(
+                    "dev" if evaluate else "train",
+                    tokenizer.__class__.__name__,
+                    str(max_seq_length),
+                    task,
+                ),
+            )
+            label_list = processor.get_labels()
+            if tokenizer.__class__ in (
+                RobertaTokenizer,
+                RobertaTokenizerFast,
+                XLMRobertaTokenizer,
+                BartTokenizer,
+                BartTokenizerFast,
+            ):
+                # HACK(label indices are swapped in RoBERTa pretrained model): swap positions 1 and 2
+                label_list[1], label_list[2] = label_list[2], label_list[1]
+            self.label_list = label_list
+
+            # Make sure only the first process in distributed training processes the dataset,
+            # and the others will use the cache (the file lock serializes access to it).
+            lock_path = cached_features_file + ".lock"
+            with FileLock(lock_path):
+
+                if os.path.exists(cached_features_file) and not overwrite_cache:
+                    logger.info(f"Loading features from cached file {cached_features_file}")
+                    self.features = torch.load(cached_features_file)  # NOTE: pickle-based; trust the cache file
+                else:
+                    logger.info(f"Creating features from dataset file at {data_dir}")
+
+                    examples = (
+                        processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir)
+                    )
+
+                    logger.info("Training examples: %s", len(examples))  # also logged for the dev split
+                    self.features = hans_convert_examples_to_features(examples, label_list, max_seq_length, tokenizer)
+                    logger.info("Saving features into cached file %s", cached_features_file)
+                    torch.save(self.features, cached_features_file)
+
+        def __len__(self):
+            return len(self.features)
+
+        def __getitem__(self, i) -> InputFeatures:
+            return self.features[i]
+
+        def get_labels(self):
+            return self.label_list
+
+
+if is_tf_available():
+    import tensorflow as tf
+
+    class TFHansDataset:
+        """
+        HANS split (dev if ``evaluate`` else train) converted to features and wrapped in a ``tf.data.Dataset``.
+        This will be superseded by a framework-agnostic approach soon.
+        """
+
+        features: List[InputFeatures]
+
+        def __init__(
+            self,
+            data_dir: str,
+            tokenizer: PreTrainedTokenizer,
+            task: str,
+            max_seq_length: Optional[int] = 128,
+            overwrite_cache=False,  # accepted but not used by this class (nothing is cached here)
+            evaluate: bool = False,
+        ):
+            processor = hans_processors[task]()  # KeyError if `task` is not a key of hans_processors
+            label_list = processor.get_labels()
+            if tokenizer.__class__ in (
+                RobertaTokenizer,
+                RobertaTokenizerFast,
+                XLMRobertaTokenizer,
+                BartTokenizer,
+                BartTokenizerFast,
+            ):
+                # HACK(label indices are swapped in RoBERTa pretrained model): swap positions 1 and 2
+                label_list[1], label_list[2] = label_list[2], label_list[1]
+            self.label_list = label_list
+
+            examples = processor.get_dev_examples(data_dir) if evaluate else processor.get_train_examples(data_dir)
+            self.features = hans_convert_examples_to_features(examples, label_list, max_seq_length, tokenizer)
+
+            def gen():
+                for (ex_index, ex) in tqdm.tqdm(enumerate(self.features), desc="convert examples to features"):
+                    if ex_index % 10000 == 0:
+                        logger.info("Writing example %d of %d" % (ex_index, len(examples)))
+
+                    yield (
+                        {
+                            "example_id": 0,  # constant placeholder: every element gets id 0
+                            "input_ids": ex.input_ids,
+                            "attention_mask": ex.attention_mask,
+                            "token_type_ids": ex.token_type_ids,
+                        },
+                        ex.label,
+                    )
+
+            self.dataset = tf.data.Dataset.from_generator(
+                gen,
+                (
+                    {
+                        "example_id": tf.int32,
+                        "input_ids": tf.int32,
+                        "attention_mask": tf.int32,
+                        "token_type_ids": tf.int32,
+                    },
+                    tf.int64,
+                ),
+                (
+                    {
+                        "example_id": tf.TensorShape([]),
+                        "input_ids": tf.TensorShape([None]),  # rank 1: gen() yields one flat id list per example
+                        "attention_mask": tf.TensorShape([None]),  # rank 1, same reason
+                        "token_type_ids": tf.TensorShape([None]),  # rank 1, same reason
+                    },
+                    tf.TensorShape([]),
+                ),
+            )
+
+        def get_dataset(self):
+            return self.dataset
+
+        def __len__(self):
+            return len(self.features)
+
+        def __getitem__(self, i) -> InputFeatures:
+            return self.features[i]
+
+        def get_labels(self):
+            return self.label_list
+
+
+class HansProcessor(DataProcessor):
+    """Processor for the HANS data set."""

    def get_train_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the train set."""
-        raise NotImplementedError()
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_train_set.txt")), "train")

    def get_dev_examples(self, data_dir):
-        """Gets a collection of `InputExample`s for the dev set."""
-        raise NotImplementedError()
+        """See base class."""
+        return self._create_examples(self._read_tsv(os.path.join(data_dir, "heuristics_evaluation_set.txt")), "dev")

    def get_labels(self):
-        """Gets the list of labels for this data set."""
-        raise NotImplementedError()
+        """See base class.
+        Note that we follow the standard three labels for MNLI
+        (see :class:`~transformers.data.processors.utils.MnliProcessor`)
+        but the HANS evaluation groups `contradiction` and `neutral` into `non-entailment` (label 0) while
+        `entailment` is label 1."""
+        return ["contradiction", "entailment", "neutral"]

-    @classmethod
-    def _read_tsv(cls, input_file, quotechar=None):
-        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding="utf-8-sig") as f:
-            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
-            lines = []
-            for line in reader:
-                lines.append(line)
-            return lines
+    def _create_examples(self, lines, set_type):
+        """Builds one ``InputExample`` per row of ``lines``, skipping row 0; ``set_type`` prefixes each guid."""
+        examples = []
+        for (i, line) in enumerate(lines):
+            if i == 0:  # first row is skipped (presumably the TSV header)
+                continue
+            guid = "%s-%s" % (set_type, line[0])  # NOTE(review): same column as `label`; likely not unique
+            text_a = line[5]
+            text_b = line[6]
+            pairID = line[7][2:] if line[7].startswith("ex") else line[7]  # drop a leading "ex" if present
+            label = line[0]
+            examples.append(InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label, pairID=pairID))
+        return examples
+
+
+def hans_convert_examples_to_features(
+    examples: List[InputExample],
+    label_list: List[str],
+    max_length: int,
+    tokenizer: PreTrainedTokenizer,
+):
+    """
+    Converts a list of ``InputExample`` into a list of ``InputFeatures``
+
+    Args:
+        examples: List of ``InputExamples`` containing the examples.
+        label_list: List of labels. Can be obtained from the processor using the ``processor.get_labels()`` method.
+        max_length: Maximum example length; every example is truncated or padded to exactly this length.
+        tokenizer: Instance of a tokenizer that will tokenize the examples.
+
+    Returns:
+        A list of task-specific ``InputFeatures`` which can be fed to the model.
+
+    """
+
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    features = []
+    for (ex_index, example) in tqdm.tqdm(enumerate(examples), desc="convert examples to features"):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d" % (ex_index))
+
+        # No return_overflowing_tokens: its extra keys are not InputFeatures fields and would break **inputs.
+        inputs = tokenizer(
+            example.text_a,
+            example.text_b,
+            add_special_tokens=True,
+            max_length=max_length,
+            padding="max_length",
+            truncation=True,
+        )
+
+        # Labels absent from label_list fall back to index 0 (HansProcessor.get_labels: non-entailment is 0).
+        label = label_map.get(example.label, 0)
+
+        pairID = int(example.pairID)
+
+        features.append(InputFeatures(**inputs, label=label, pairID=pairID))
+
+    for i, example in enumerate(examples[:5]):
+        logger.info("*** Example ***")
+        logger.info(f"guid: {example.guid}")
+        logger.info(f"features: {features[i]}")
+
+    return features
+
+
+hans_tasks_num_labels = {
+    "hans": 3,
+}
+
+hans_processors = {
+    "hans": HansProcessor,
+}
--- a/Show More
+++ b/Show More