Release: v4.2.2

Fix GPT conversion script (#9676)
Fix imports in conversion scripts (#9674)
2021-01-21 09:06:41 +01:00 · 2021-01-21 09:00:32 +01:00 · 2021-01-21 09:00:26 +01:00 · 2021-01-21 08:57:35 +01:00 · 2021-01-14 14:27:36 +01:00 · 2021-01-14 14:24:32 +01:00
1553 changed files with 159130 additions and 75480 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -3,6 +3,7 @@ orbs:
    gcp-gke: circleci/gcp-gke@1.0.4
    go: circleci/go@1.3.0

+
 # TPU REFERENCES
 references:
    checkout_ml_testing: &checkout_ml_testing
@@ -74,21 +75,21 @@ jobs:
            - checkout
            - restore_cache:
                  keys:
-                      - v0.3-torch_and_tf-{{ checksum "setup.py" }}
-                      - v0.3-{{ checksum "setup.py" }}
+                      - v0.4-torch_and_tf-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install git+https://github.com/huggingface/datasets
-            - run: pip install .[sklearn,tf-cpu,torch,testing]
-            - run: pip install codecov pytest-cov
+            - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
-                key: v0.3-{{ checksum "setup.py" }}
+                key: v0.4-{{ checksum "setup.py" }}
                paths:
                    - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s ./tests/ --cov  | tee output.txt
-            - run: codecov
+            - run: RUN_PT_TF_CROSS_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_torch_and_tf ./tests/ -m is_pt_tf_cross_test --durations=0 | tee tests_output.txt
            - store_artifacts:
-                  path: ~/transformers/output.txt
-                  destination: test_output.txt
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
    run_tests_torch:
        working_directory: ~/transformers
        docker:
@@ -101,19 +102,21 @@ jobs:
            - checkout
            - restore_cache:
                  keys:
-                      - v0.3-torch-{{ checksum "setup.py" }}
-                      - v0.3-{{ checksum "setup.py" }}
+                      - v0.4-torch-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install git+https://github.com/huggingface/datasets
-            - run: pip install .[sklearn,torch,testing]
+            - run: pip install .[sklearn,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
            - save_cache:
-                  key: v0.3-torch-{{ checksum "setup.py" }}
+                  key: v0.4-torch-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s ./tests/ | tee output.txt
+            - run: python -m pytest -n 8 --dist=loadfile -s --make-reports=tests_torch ./tests/ | tee tests_output.txt
            - store_artifacts:
-                  path: ~/transformers/output.txt
-                  destination: test_output.txt
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
    run_tests_tf:
        working_directory: ~/transformers
        docker:
@@ -126,42 +129,124 @@ jobs:
            - checkout
            - restore_cache:
                  keys:
-                      - v0.3-tf-{{ checksum "setup.py" }}
-                      - v0.3-{{ checksum "setup.py" }}
+                      - v0.4-tf-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install git+https://github.com/huggingface/datasets
-            - run: pip install .[sklearn,tf-cpu,testing]
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
            - save_cache:
-                  key: v0.3-tf-{{ checksum "setup.py" }}
+                  key: v0.4-tf-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s ./tests/ | tee output.txt
+            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_tf ./tests/ | tee tests_output.txt
            - store_artifacts:
-               path: ~/transformers/output.txt
-               destination: test_output.txt
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
+    run_tests_flax:  # runs ./tests/ with the flax extras installed; reports go to reports/tests_flax*
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-flax-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: sudo pip install .[flax,sklearn,torch,testing,sentencepiece]  # NOTE(review): only job that installs via sudo; a root install may not use the ~/.cache/pip saved below - confirm sudo is needed
+            - save_cache:
+                  key: v0.4-flax-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_flax ./tests/ | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
+    run_tests_pipelines_torch:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-torch-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,torch,testing,sentencepiece]
+            - run: pip install tapas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cpu.html
+            - save_cache:
+                  key: v0.4-torch-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: RUN_PIPELINE_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_torch -m is_pipeline_test ./tests/ | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
+    run_tests_pipelines_tf:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        environment:
+            OMP_NUM_THREADS: 1
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - restore_cache:
+                  keys:
+                      - v0.4-tf-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: pip install --upgrade pip
+            - run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
+            - save_cache:
+                  key: v0.4-tf-{{ checksum "setup.py" }}
+                  paths:
+                      - '~/.cache/pip'
+            - run: RUN_PIPELINE_TESTS=1 python -m pytest -n 8 --dist=loadfile -rA -s --make-reports=tests_pipelines_tf ./tests/ -m is_pipeline_test | tee tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
    run_tests_custom_tokenizers:
        working_directory: ~/transformers
        docker:
-            - image: circleci/python:3.6
+            - image: circleci/python:3.7
        environment:
            RUN_CUSTOM_TOKENIZERS: yes
        steps:
            - checkout
            - restore_cache:
                  keys:
-                      - v0.3-custom_tokenizers-{{ checksum "setup.py" }}
-                      - v0.3-{{ checksum "setup.py" }}
+                      - v0.4-custom_tokenizers-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[ja,testing]
+            - run: pip install .[ja,testing,sentencepiece]
            - run: python -m unidic download
            - save_cache:
-                  key: v0.3-custom_tokenizers-{{ checksum "setup.py" }}
+                  key: v0.4-custom_tokenizers-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -s ./tests/test_tokenization_bert_japanese.py | tee output.txt
+            - run: python -m pytest -s --make-reports=tests_custom_tokenizers ./tests/test_tokenization_bert_japanese.py | tee tests_output.txt
            - store_artifacts:
-                path: ~/transformers/output.txt
-                destination: test_output.txt
+                  path: ~/transformers/tests_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
    run_examples_torch:
        working_directory: ~/transformers
        docker:
@@ -174,19 +259,37 @@ jobs:
            - checkout
            - restore_cache:
                  keys:
-                      - v0.3-torch_examples-{{ checksum "setup.py" }}
-                      - v0.3-{{ checksum "setup.py" }}
+                      - v0.4-torch_examples-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[sklearn,torch,testing]
-            - run: pip install -r examples/requirements.txt
+            - run: pip install .[sklearn,torch,sentencepiece,testing]
+            - run: pip install -r examples/_tests_requirements.txt
            - save_cache:
-                  key: v0.3-torch_examples-{{ checksum "setup.py" }}
+                  key: v0.4-torch_examples-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: python -m pytest -n 8 --dist=loadfile -rA -s ./examples/ | tee output.txt
+            - run: python -m pytest -n 8 --dist=loadfile -s --make-reports=examples_torch ./examples/ | tee examples_output.txt
            - store_artifacts:
-                  path: ~/transformers/output.txt
-                  destination: test_output.txt
+                  path: ~/transformers/examples_output.txt
+            - store_artifacts:
+                  path: ~/transformers/reports
+
+    run_tests_git_lfs:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.7
+        resource_class: xlarge
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo apt-get install git-lfs
+            - run: |
+                git config --global user.email "ci@dummy.com"
+                git config --global user.name "ci"
+            - run: pip install --upgrade pip
+            - run: pip install .[testing]
+            - run: RUN_GIT_LFS_TESTS=1 python -m pytest -sv ./tests/test_hf_api.py -k "HfLargefilesTest"
+
    build_doc:
        working_directory: ~/transformers
        docker:
@@ -195,17 +298,18 @@ jobs:
            - checkout
            - restore_cache:
                  keys:
-                      - v0.3-build_doc-{{ checksum "setup.py" }}
-                      - v0.3-{{ checksum "setup.py" }}
+                      - v0.4-build_doc-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
-            - run: pip install .[tf,torch,docs]
+            - run: pip install ."[all, docs]"
            - save_cache:
-                  key: v0.3-build_doc-{{ checksum "setup.py" }}
+                  key: v0.4-build_doc-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
            - run: cd docs && make html SPHINXOPTS="-W"
            - store_artifacts:
                path: ./docs/_build
+
    deploy_doc:
        working_directory: ~/transformers
        docker:
@@ -217,14 +321,15 @@ jobs:
            - checkout
            - restore_cache:
                  keys:
-                      - v0.3-deploy_doc-{{ checksum "setup.py" }}
-                      - v0.3-{{ checksum "setup.py" }}
-            - run: pip install .[tf,torch,docs]
+                      - v0.4-deploy_doc-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
+            - run: pip install ."[all,docs]"
            - save_cache:
-                  key: v0.3-deploy_doc-{{ checksum "setup.py" }}
+                  key: v0.4-deploy_doc-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
            - run: ./.circleci/deploy.sh
+
    check_code_quality:
        working_directory: ~/transformers
        docker:
@@ -235,20 +340,24 @@ jobs:
            - checkout
            - restore_cache:
                  keys:
-                      - v0.3-code_quality-{{ checksum "setup.py" }}
-                      - v0.3-{{ checksum "setup.py" }}
+                      - v0.4-code_quality-{{ checksum "setup.py" }}
+                      - v0.4-{{ checksum "setup.py" }}
            - run: pip install --upgrade pip
            - run: pip install isort
-            - run: pip install .[tf,torch,quality]
+            - run: pip install .[all,quality]
            - save_cache:
-                  key: v0.3-code_quality-{{ checksum "setup.py" }}
+                  key: v0.4-code_quality-{{ checksum "setup.py" }}
                  paths:
                      - '~/.cache/pip'
-            - run: black --check examples templates tests src utils
-            - run: isort --check-only examples templates tests src utils
-            - run: flake8 examples templates tests src utils
+            - run: black --check examples tests src utils
+            - run: isort --check-only examples tests src utils
+            - run: flake8 examples tests src utils
+            - run: python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
            - run: python utils/check_copies.py
+            - run: python utils/check_table.py
+            - run: python utils/check_dummies.py
            - run: python utils/check_repo.py
+
    check_repository_consistency:
        working_directory: ~/transformers
        docker:
@@ -279,6 +388,7 @@ jobs:
            - setup_remote_docker
            - *build_push_docker
            - *deploy_cluster
+
    cleanup-gke-jobs:
        docker:
            - image: circleci/python:3.6
@@ -288,6 +398,7 @@ jobs:
                  cluster: $GKE_CLUSTER
                  perform-login: true
            - *delete_gke_jobs
+
 workflow_filters: &workflow_filters
    filters:
        branches:
@@ -304,6 +415,10 @@ workflows:
            - run_tests_torch_and_tf
            - run_tests_torch
            - run_tests_tf
+            - run_tests_flax
+            - run_tests_pipelines_torch
+            - run_tests_pipelines_tf
+            - run_tests_git_lfs
            - build_doc
            - deploy_doc: *workflow_filters
    tpu_testing_jobs:
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -49,4 +49,9 @@ deploy_doc "10d7239" v2.10.0
 deploy_doc "b42586e" v2.11.0
 deploy_doc "7fb8bdf" v3.0.2
 deploy_doc "4b3ee9c" v3.1.0
-deploy_doc "3ebb1b3" # v3.2.0 Latest stable release
+deploy_doc "3ebb1b3" v3.2.0
+deploy_doc "0613f05" v3.3.1
+deploy_doc "eb0e0ce" v3.4.0
+deploy_doc "818878d" v3.5.1
+deploy_doc "c781171" v4.0.0
+deploy_doc "bfa4ccf" # v4.1.1 Latest stable release
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -11,7 +11,7 @@ assignees: ''
 ## Environment info
 <!-- You can run the command `transformers-cli env` and copy-and-paste its output below.
     Don't forget to fill out the missing fields in that output! -->
-     
+
 - `transformers` version:
 - Platform:
 - Python version:
@@ -24,27 +24,30 @@ assignees: ''
 <!-- Your issue will be replied to more quickly if you can figure out the right person to tag with @
 If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
 Please tag fewer than 3 people.
- 
- albert, bert, GPT2, XLM: @LysandreJik 
+
+ albert, bert, GPT2, XLM: @LysandreJik
 tokenizers: @mfuntowicz
 Trainer: @sgugger
 Speed and Memory Benchmarks: @patrickvonplaten
 Model Cards: @julien-c
- Translation: @sshleifer
- Summarization: @sshleifer
- TextGeneration: @TevenLeScao 
+ TextGeneration: @TevenLeScao
 examples/distillation: @VictorSanh
 nlp datasets: [different repo](https://github.com/huggingface/nlp)
 rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
- Text Generation: @TevenLeScao
- blenderbot: @mariamabarham
- Bart: @sshleifer
- Marian: @sshleifer
+ Text Generation: @patrickvonplaten @TevenLeScao
+ Blenderbot: @patrickvonplaten
+ Bart: @patrickvonplaten
+ Marian: @patrickvonplaten
+ Pegasus: @patrickvonplaten
+ mBART: @patrickvonplaten
 T5: @patrickvonplaten
 Longformer/Reformer: @patrickvonplaten
- TransfoXL/XLNet: @TevenLeScao 
- examples/seq2seq: @sshleifer
+ TransfoXL/XLNet: @TevenLeScao
+ RAG: @patrickvonplaten, @lhoestq
+ FSMT: @stas00
+ examples/seq2seq: @patil-suraj
 examples/bert-loses-patience: @JetRunner
+ ray/raytune: @richardliaw @amogkam
 tensorflow: @jplu
 examples/token-classification: @stefan-it
 documentation: @sgugger
--- a/.github/ISSUE_TEMPLATE/question-help.md
+++ b/.github/ISSUE_TEMPLATE/question-help.md
@@ -1,6 +1,6 @@
 ---
 name: "❓ Questions & Help"
-about: Post your general questions on the Hugging Face forum or Stack Overflow tagged huggingface-transformers
+about: "Post your general questions on the Hugging Face forum: https://discuss.huggingface.co/"
 title: ''
 labels: ''
 assignees: ''
@@ -10,18 +10,17 @@ assignees: ''
 # ❓ Questions & Help

 <!-- The GitHub issue tracker is primarly intended for bugs, feature requests,
-     new models and benchmarks, and migration questions. For all other questions,
+     new models, benchmarks, and migration questions. For all other questions,
     we direct you to the Hugging Face forum: https://discuss.huggingface.co/ .
-     You can also try Stack Overflow (SO) where a whole community of PyTorch and
-     Tensorflow enthusiast can help you out. In this case, make sure to tag your
-     question with the right deep learning framework as well as the
-     huggingface-transformers tag: 
-     https://stackoverflow.com/questions/tagged/huggingface-transformers 
     -->

 ## Details
+
 <!-- Description of your issue -->

-<!-- You should first ask your question on the forum or SO, and only if
-     you didn't get an answer ask it here on GitHub. -->
-**A link to original question on the forum/Stack Overflow**:
+<!-- You should first ask your question on the forum, and only if
+     you didn't get an answer after a few days ask it here on GitHub. -->
+
+**A link to original question on the forum**:
+
+<!-- Your issue will be closed if you don't fill this part. -->
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -16,15 +16,15 @@ Fixes # (issue)


 ## Before submitting
- [ ] This PR fixes a typo or improves the docs (you can dimiss the other checks if that's the case).
- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#start-contributing-pull-requests), 
+- [ ] This PR fixes a typo or improves the docs (you can dismiss the other checks if that's the case).
+- [ ] Did you read the [contributor guideline](https://github.com/huggingface/transformers/blob/master/CONTRIBUTING.md#start-contributing-pull-requests),
      Pull Request section?
 - [ ] Was this discussed/approved via a Github issue or the [forum](https://discuss.huggingface.co/)? Please add a link
-      to the it if that's the case.
+      to it if that's the case.
 - [ ] Did you make sure to update the documentation with your changes? Here are the
      [documentation guidelines](https://github.com/huggingface/transformers/tree/master/docs), and
      [here are tips on formatting docstrings](https://github.com/huggingface/transformers/tree/master/docs#writing-source-documentation).
- [ ] Did you write any new necessary tests? 
+- [ ] Did you write any new necessary tests?


 ## Who can review?
@@ -37,25 +37,26 @@ members/contributors which may be interested in your PR.
 If you know how to use git blame, that is the easiest way, otherwise, here is a rough guide of **who to tag**.
 Please tag fewer than 3 people.

- albert, bert, GPT2, XLM: @LysandreJik 
+ albert, bert, XLM: @LysandreJik
+ GPT2: @LysandreJik, @patrickvonplaten
 tokenizers: @mfuntowicz
 Trainer: @sgugger
- Speed and Memory Benchmarks: @patrickvonplaten
+ Benchmarks: @patrickvonplaten
 Model Cards: @julien-c
- Translation: @sshleifer
- Summarization: @sshleifer
- TextGeneration: @TevenLeScao 
 examples/distillation: @VictorSanh
 nlp datasets: [different repo](https://github.com/huggingface/nlp)
 rust tokenizers: [different repo](https://github.com/huggingface/tokenizers)
- Text Generation: @TevenLeScao
- Blenderbot, Bart, Marian, Pegasus: @sshleifer
+ Text Generation: @patrickvonplaten, @TevenLeScao
+ Blenderbot, Bart, Marian, Pegasus: @patrickvonplaten
 T5: @patrickvonplaten
- Longformer/Reformer: @patrickvonplaten
- TransfoXL/XLNet: @TevenLeScao 
- examples/seq2seq: @sshleifer
+ Rag: @patrickvonplaten, @lhoestq
+ EncoderDecoder: @patrickvonplaten
+ Longformer, Reformer: @patrickvonplaten
+ TransfoXL, XLNet: @TevenLeScao, @patrickvonplaten
+ examples/seq2seq: @patil-suraj
 examples/bert-loses-patience: @JetRunner
 tensorflow: @jplu
 examples/token-classification: @stefan-it
 documentation: @sgugger
- -->
+ FSMT: @stas00
+ -->
--- a/.github/conda/build.sh
+++ b/.github/conda/build.sh
@@ -0,0 +1 @@
+$PYTHON setup.py install     # Python command to install the script.
--- a/.github/conda/meta.yaml
+++ b/.github/conda/meta.yaml
@@ -0,0 +1,48 @@
+{% set name = "transformers" %}
+
+package:
+  name: "{{ name|lower }}"
+  version: "{{ TRANSFORMERS_VERSION }}"
+
+source:
+  path: ../../
+
+build:
+  noarch: python
+
+requirements:
+  host:
+    - python
+    - pip
+    - numpy
+    - dataclasses
+    - packaging
+    - filelock
+    - requests
+    - tqdm >=4.27
+    - sacremoses
+    - regex !=2019.12.17
+    - protobuf
+    - tokenizers ==0.9.4
+  run:
+    - python
+    - numpy
+    - dataclasses
+    - packaging
+    - filelock
+    - requests
+    - tqdm >=4.27
+    - sacremoses
+    - regex !=2019.12.17
+    - protobuf
+    - tokenizers ==0.9.4
+
+test:
+  imports:
+    - transformers
+
+about:
+  home: https://huggingface.co
+  license: Apache License 2.0
+  license_file: LICENSE
+  summary: "🤗Transformers: State-of-the-art Natural Language Processing for Pytorch and TensorFlow 2.0."
--- a/.github/stale.yml
+++ b/.github/stale.yml
@@ -6,6 +6,7 @@ daysUntilClose: 7
 exemptLabels:
  - pinned
  - security
+  - Feature request
 # Label to use when marking an issue as stale
 staleLabel: wontfix
 # Comment to post when marking an issue as stale. Set to `false` to disable
--- a/.github/workflows/github-torch-hub.yml
+++ b/.github/workflows/github-torch-hub.yml
@@ -1,6 +1,6 @@
 name: Torch hub integration

-on: 
+on:
  push:
    branches:
      - "*"
@@ -8,6 +8,9 @@ on:
 jobs:
  torch_hub_integration:
    runs-on: ubuntu-latest
+    env:
+      # TODO quickfix but may need more investigation
+      ACTIONS_ALLOW_UNSECURE_COMMANDS: 'true'  # quoted: environment values are strings, not YAML booleans
    steps:
    # no checkout necessary here.
    - name: Extract branch name
@@ -29,8 +32,10 @@ jobs:
    - name: Install dependencies
      run: |
        pip install --upgrade pip
-        pip install torch
-        pip install numpy tokenizers filelock requests tqdm regex sentencepiece sacremoses packaging
+        # install torch-hub specific dependencies
+        pip install -e git+https://github.com/huggingface/transformers.git#egg=transformers[torchhub]
+        # no longer needed
+        pip uninstall -y transformers

    - name: Torch hub list
      run: |
--- a/.github/workflows/model-templates.yml
+++ b/.github/workflows/model-templates.yml
@@ -0,0 +1,67 @@
+name: Model templates runner
+
+on:
+  push:
+    paths:
+      - "src/**"
+      - "tests/**"
+      - ".github/**"
+      - "templates/**"
+
+jobs:
+  run_tests_templates:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v1
+
+      - name: Install Python
+        uses: actions/setup-python@v1
+        with:
+          python-version: '3.6'  # quoted so the version stays a string (an unquoted 3.10 would parse as 3.1)
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: ~/.cache/pip
+          key: v1.2-tests_templates-${{ hashFiles('setup.py') }}  # keyed on setup.py so dependency changes save a fresh cache
+          restore-keys: |
+            v1.2-tests_templates-${{ hashFiles('setup.py') }}
+            v1.2-tests_templates
+
+      - name: Install dependencies
+        run: |
+          pip install --upgrade pip
+          pip install .[dev]
+      - name: Create model files
+        run: |
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
+          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-seq-2-seq-bart-tokenizer.json --path=templates/adding_a_new_model
+          make style
+          python utils/check_table.py --fix_and_overwrite
+          python utils/check_dummies.py --fix_and_overwrite
+
+      - name: Run all non-slow tests
+        run: |
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_templates tests/*template*
+
+      - name: Run style changes
+        run: |
+          git fetch origin master:master
+          make fixup
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_templates_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_templates_test_reports
+          path: reports
--- a/.github/workflows/release-conda.yml
+++ b/.github/workflows/release-conda.yml
@@ -0,0 +1,44 @@
+name: Release - Conda
+
+on:
+  push:
+    tags:
+      - v*
+
+env:
+  ANACONDA_API_TOKEN: ${{ secrets.ANACONDA_API_TOKEN }}
+
+jobs:
+  build_and_package:
+    runs-on: ubuntu-latest
+    defaults:
+      run:
+        shell: bash -l {0}
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v1
+
+      - name: Install miniconda
+        uses: conda-incubator/setup-miniconda@v2
+        with:
+          auto-update-conda: true
+          auto-activate-base: false
+          activate-environment: "build-transformers"
+          channels: huggingface
+
+      - name: Setup conda env
+        run: |
+          conda install -c defaults anaconda-client conda-build
+
+      - name: Extract version
+        run: echo "TRANSFORMERS_VERSION=`python setup.py --version`" >> $GITHUB_ENV
+
+      - name: Build conda packages
+        run: |
+          conda info
+          conda list
+          conda-build .github/conda
+
+      - name: Upload to Anaconda
+        run: anaconda upload `conda-build .github/conda --output` --force
--- a/.github/workflows/self-push.yml
+++ b/.github/workflows/self-push.yml
@@ -1,64 +1,275 @@
 name: Self-hosted runner (push)

-on: 
+on:
  push:
    branches:
      - master
-    paths: 
+      - ci_*
+    paths:
      - "src/**"
      - "tests/**"
      - ".github/**"
+      - "templates/**"
  # pull_request:
  repository_dispatch:


 jobs:
-  run_tests_torch_and_tf_gpu:
-    runs-on: self-hosted
+  run_tests_torch_gpu:
+    runs-on: [self-hosted, gpu, single-gpu]
    steps:
-    - uses: actions/checkout@v2
-    - name: Python version
-      run: |
-        which python
-        python --version
-        pip --version
-    - name: Current dir
-      run: pwd
-    - run: nvidia-smi
+      - uses: actions/checkout@v2
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version

-    - name: Loading cache.
-      uses: actions/cache@v2
-      id: cache
-      with:
-        path: .env
-        key: v0-tests_tf_torch_gpu-${{ hashFiles('setup.py') }}
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi

-    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-      run: |
-        python -m venv .env
-        source .env/bin/activate
-        which python
-        python --version
-        pip --version
-    - name: Install dependencies
-      run: |
-        source .env/bin/activate
-        pip install --upgrade pip
-        pip install torch!=1.6.0
-        pip install .[sklearn,testing,onnxruntime]
-        pip install git+https://github.com/huggingface/datasets
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_torch_gpu-${{ hashFiles('setup.py') }}

-    - name: Are GPUs recognized by our DL frameworks
-      run: |
-        source .env/bin/activate
-        python -c "import torch; print(torch.cuda.is_available())"
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

-    - name: Run all non-slow tests on GPU
-      env:
-        TF_FORCE_GPU_ALLOW_GROWTH: "true"
-        # TF_GPU_MEMORY_LIMIT: 4096
-        OMP_NUM_THREADS: 1
-        USE_CUDA: yes
-      run: |
-        source .env/bin/activate
-        python -m pytest -n 2 --dist=loadfile -s ./tests/
+      - name: Install dependencies
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          source .env/bin/activate
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+#      - name: Create model files
+#        run: |
+#          source .env/bin/activate
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+
+      - name: Run all non-slow tests on GPU
+        env:
+          OMP_NUM_THREADS: 1
+          CUDA_VISIBLE_DEVICES: 0
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_gpu_failures_short.txt
+        
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_torch_gpu_test_reports
+          path: reports
+                  
+
+  run_tests_tf_gpu:
+    runs-on: [self-hosted, gpu, single-gpu]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_tf_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
+
+      - name: Install dependencies
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          source .env/bin/activate
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+
+      - name: Create model files
+        run: |
+          source .env/bin/activate
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/pt-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/standalone.json --path=templates/adding_a_new_model
+#          transformers-cli add-new-model --testing --testing_file=templates/adding_a_new_model/tests/tf-encoder-bert-tokenizer.json --path=templates/adding_a_new_model
+
+      - name: Run all non-slow tests on GPU
+        env:
+          OMP_NUM_THREADS: 1
+          CUDA_VISIBLE_DEVICES: 0
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_tf_gpu_failures_short.txt
+        
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_tf_gpu_test_reports
+          path: reports
+
+  run_tests_torch_multi_gpu:
+    runs-on: [self-hosted, gpu, multi-gpu]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
+      - name: Install dependencies
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip install pandas torch-scatter -f https://pytorch-geometric.com/whl/torch-1.7.0+cu102.html
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          source .env/bin/activate
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all non-slow tests on GPU
+        env:
+          OMP_NUM_THREADS: 1
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_multi_gpu_failures_short.txt          
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_torch_multi_gpu_test_reports
+          path: reports
+
+  run_tests_tf_multi_gpu:
+    runs-on: [self-hosted, gpu, multi-gpu]
+    steps:
+      - uses: actions/checkout@v2
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
+      - name: Install dependencies
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          source .env/bin/activate
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+
+      - name: Run all non-slow tests on GPU
+        env:
+          OMP_NUM_THREADS: 1
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 2 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_tf_multi_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_tf_multi_gpu_test_reports
+          path: reports
+          
--- a/.github/workflows/self-scheduled.yml
+++ b/.github/workflows/self-scheduled.yml
@@ -1,72 +1,356 @@
+# configuration notes:
+#
+# - `source .env/bin/activate` must currently be run first in each step. Otherwise
+# the step uses the system-wide python interpreter.
+
 name: Self-hosted runner (scheduled)

 on:
-  push:
-    branches:
-      - ci_*
  repository_dispatch:
  schedule:
    - cron: "0 0 * * *"

 jobs:
-  run_all_tests_torch_and_tf_gpu:
-    runs-on: self-hosted
+  run_all_tests_torch_gpu:
+    runs-on: [self-hosted, gpu, single-gpu]
    steps:
-    - uses: actions/checkout@v2
+      - uses: actions/checkout@v2

-    - name: Loading cache.
-      uses: actions/cache@v2
-      id: cache
-      with:
-        path: .env
-        key: v0-slow_tests_tf_torch_gpu-${{ hashFiles('setup.py') }}
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache  # referenced below via steps.cache.outputs.cache-hit
+        with:
+          path: .env  # the virtualenv directory created by the "Create new python env" step
+          key: v1.1-slow_tests_torch_gpu-${{ hashFiles('setup.py') }}

-    - name: Python version
-      run: |
-        which python
-        python --version
-        pip --version
-    - name: Current dir
-      run: pwd
-    - run: nvidia-smi
-    - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
-      if: steps.cache.outputs.cache-hit != 'true'
-      run: |
-        python -m venv .env
-        source .env/bin/activate
-        which python
-        python --version
-        pip --version
-    - name: Install dependencies
-      run: |
-        source .env/bin/activate
-        pip install --upgrade pip
-        pip install torch!=1.6.0
-        pip install .[sklearn,testing,onnxruntime]
-        pip install git+https://github.com/huggingface/datasets
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version

-    - name: Are GPUs recognized by our DL frameworks
-      run: |
-        source .env/bin/activate
-        python -c "import torch; print(torch.cuda.is_available())"
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi

-    - name: Run all tests on GPU
-      env:
-        TF_FORCE_GPU_ALLOW_GROWTH: "true"
-        OMP_NUM_THREADS: 1
-        RUN_SLOW: yes
-        USE_CUDA: yes
-      run: |
-        source .env/bin/activate
-        python -m pytest -n 1 --dist=loadfile -s ./tests/
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version

-    - name: Run examples tests on GPU
-      env:
-        TF_FORCE_GPU_ALLOW_GROWTH: "true"
-        OMP_NUM_THREADS: 1
-        RUN_SLOW: yes
-        USE_CUDA: yes
-      run: |
-        source .env/bin/activate
-        pip install -r examples/requirements.txt
-        python -m pytest -n 1 --dist=loadfile -s examples
+      - name: Install dependencies
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip list
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          source .env/bin/activate
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on GPU
+        env:
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_gpu_failures_short.txt
+        
+      - name: Run examples tests on GPU
+        if: ${{ always() }}
+        env:
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+        run: |
+          source .env/bin/activate
+          pip install -r examples/_tests_requirements.txt
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=examples_torch_gpu examples
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/examples_torch_gpu_failures_short.txt
+
+      - name: Run all pipeline tests on GPU
+        if: ${{ always() }}
+        env:
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+          RUN_PIPELINE_TESTS: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_pipeline_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_torch_gpu_test_reports
+          path: reports
+
+
+  run_all_tests_tf_gpu:
+    runs-on: [self-hosted, gpu, single-gpu]
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-slow_tests_tf_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
+
+      - name: Install dependencies
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip list
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          source .env/bin/activate
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+
+      - name: Run all tests on GPU
+        env:
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_gpu tests
+          
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_tf_gpu_failures_short.txt
+
+      - name: Run all pipeline tests on GPU
+        if: ${{ always() }}
+        env:
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+          RUN_PIPELINE_TESTS: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipelines_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_tf_pipelines_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_tf_gpu_test_reports
+          path: reports
+          
+  run_all_tests_torch_multi_gpu:
+    runs-on: [self-hosted, gpu, multi-gpu]
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-slow_tests_torch_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
+
+      - name: Install dependencies
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install .[torch,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip list
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          source .env/bin/activate
+          python -c "import torch; print('Cuda available:', torch.cuda.is_available())"
+          python -c "import torch; print('Number of GPUs available:', torch.cuda.device_count())"
+
+      - name: Run all tests on multi-GPU
+        env:
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_multi_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_multi_gpu_failures_short.txt
+
+      - name: Run examples tests on multi-GPU
+        env:
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_torch_examples_multi_gpu examples
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_examples_multi_gpu_failures_short.txt
+
+      - name: Run all pipeline tests on multi-GPU
+        if: ${{ always() }}
+        env:
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+          RUN_PIPELINE_TESTS: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_torch_pipeline_multi_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_torch_pipeline_multi_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_torch_multi_gpu_test_reports
+          path: reports
+
+  run_all_tests_tf_multi_gpu:
+    runs-on: [self-hosted, gpu, multi-gpu]
+    steps:
+      - uses: actions/checkout@v2
+
+      - name: Loading cache.
+        uses: actions/cache@v2
+        id: cache
+        with:
+          path: .env
+          key: v1.1-slow_tests_tf_multi_gpu-${{ hashFiles('setup.py') }}
+
+      - name: Python version
+        run: |
+          which python
+          python --version
+          pip --version
+
+      - name: Current dir
+        run: pwd
+      - run: nvidia-smi
+
+      - name: Create new python env (on self-hosted runners we have to handle isolation ourselves)
+        if: steps.cache.outputs.cache-hit != 'true'
+        run: |
+          python -m venv .env
+          source .env/bin/activate
+          which python
+          python --version
+          pip --version
+
+      - name: Install dependencies
+        run: |
+          source .env/bin/activate
+          pip install --upgrade pip
+          pip install .[tf,sklearn,testing,onnxruntime,sentencepiece]
+          pip install git+https://github.com/huggingface/datasets
+          pip list
+
+      - name: Are GPUs recognized by our DL frameworks
+        run: |
+          source .env/bin/activate
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('TF GPUs available:', bool(tf.config.list_physical_devices('GPU')))"
+          TF_CPP_MIN_LOG_LEVEL=3 python -c "import tensorflow as tf; print('Number of TF GPUs available:', len(tf.config.list_physical_devices('GPU')))"
+
+      - name: Run all tests on multi-GPU
+        env:
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s --make-reports=tests_tf_multi_gpu tests
+
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_tf_multi_gpu_failures_short.txt
+
+      - name: Run all pipeline tests on multi-GPU
+        if: ${{ always() }}
+        env:
+          TF_FORCE_GPU_ALLOW_GROWTH: "true"
+          OMP_NUM_THREADS: 1
+          RUN_SLOW: yes
+          RUN_PIPELINE_TESTS: yes
+        run: |
+          source .env/bin/activate
+          python -m pytest -n 1 --dist=loadfile -s -m is_pipeline_test --make-reports=tests_tf_pipeline_multi_gpu tests
+          
+      - name: Failure short reports
+        if: ${{ always() }}
+        run: cat reports/tests_tf_pipeline_multi_gpu_failures_short.txt
+
+      - name: Test suite reports artifacts
+        if: ${{ always() }}
+        uses: actions/upload-artifact@v2
+        with:
+          name: run_all_tests_tf_multi_gpu_test_reports
+          path: reports
+          
--- a/.gitignore
+++ b/.gitignore
@@ -9,9 +9,11 @@ __pycache__/
 *.so

 # tests and logs
-tests/fixtures
+tests/fixtures/*
+!tests/fixtures/sample_text_no_unicode.txt
 logs/
 lightning_logs/
+lang_code_data/

 # Distribution / packaging
 .Python
@@ -131,7 +133,6 @@ dmypy.json
 tensorflow_code

 # Models
-models
 proc_data

 # examples
@@ -155,3 +156,9 @@ debug.env

 #ctags
 tags
+
+# pre-commit
+.pre-commit*
+
+# .lock
+*.lock
--- a/CODE_OF_CONDUCT.md
+++ b/CODE_OF_CONDUCT.md
@@ -0,0 +1,129 @@
+
+# Contributor Covenant Code of Conduct
+
+## Our Pledge
+
+We as members, contributors, and leaders pledge to make participation in our
+community a harassment-free experience for everyone, regardless of age, body
+size, visible or invisible disability, ethnicity, sex characteristics, gender
+identity and expression, level of experience, education, socio-economic status,
+nationality, personal appearance, race, religion, or sexual identity
+and orientation.
+
+We pledge to act and interact in ways that contribute to an open, welcoming,
+diverse, inclusive, and healthy community.
+
+## Our Standards
+
+Examples of behavior that contributes to a positive environment for our
+community include:
+
+* Demonstrating empathy and kindness toward other people
+* Being respectful of differing opinions, viewpoints, and experiences
+* Giving and gracefully accepting constructive feedback
+* Accepting responsibility and apologizing to those affected by our mistakes,
+  and learning from the experience
+* Focusing on what is best not just for us as individuals, but for the
+  overall community
+
+Examples of unacceptable behavior include:
+
+* The use of sexualized language or imagery, and sexual attention or
+  advances of any kind
+* Trolling, insulting or derogatory comments, and personal or political attacks
+* Public or private harassment
+* Publishing others' private information, such as a physical or email
+  address, without their explicit permission
+* Other conduct which could reasonably be considered inappropriate in a
+  professional setting
+
+## Enforcement Responsibilities
+
+Community leaders are responsible for clarifying and enforcing our standards of
+acceptable behavior and will take appropriate and fair corrective action in
+response to any behavior that they deem inappropriate, threatening, offensive,
+or harmful.
+
+Community leaders have the right and responsibility to remove, edit, or reject
+comments, commits, code, wiki edits, issues, and other contributions that are
+not aligned to this Code of Conduct, and will communicate reasons for moderation
+decisions when appropriate.
+
+## Scope
+
+This Code of Conduct applies within all community spaces, and also applies when
+an individual is officially representing the community in public spaces.
+Examples of representing our community include using an official e-mail address,
+posting via an official social media account, or acting as an appointed
+representative at an online or offline event.
+
+## Enforcement
+
+Instances of abusive, harassing, or otherwise unacceptable behavior may be
+reported to the community leaders responsible for enforcement at
+feedback@huggingface.co.
+All complaints will be reviewed and investigated promptly and fairly.
+
+All community leaders are obligated to respect the privacy and security of the
+reporter of any incident.
+
+## Enforcement Guidelines
+
+Community leaders will follow these Community Impact Guidelines in determining
+the consequences for any action they deem in violation of this Code of Conduct:
+
+### 1. Correction
+
+**Community Impact**: Use of inappropriate language or other behavior deemed
+unprofessional or unwelcome in the community.
+
+**Consequence**: A private, written warning from community leaders, providing
+clarity around the nature of the violation and an explanation of why the
+behavior was inappropriate. A public apology may be requested.
+
+### 2. Warning
+
+**Community Impact**: A violation through a single incident or series
+of actions.
+
+**Consequence**: A warning with consequences for continued behavior. No
+interaction with the people involved, including unsolicited interaction with
+those enforcing the Code of Conduct, for a specified period of time. This
+includes avoiding interactions in community spaces as well as external channels
+like social media. Violating these terms may lead to a temporary or
+permanent ban.
+
+### 3. Temporary Ban
+
+**Community Impact**: A serious violation of community standards, including
+sustained inappropriate behavior.
+
+**Consequence**: A temporary ban from any sort of interaction or public
+communication with the community for a specified period of time. No public or
+private interaction with the people involved, including unsolicited interaction
+with those enforcing the Code of Conduct, is allowed during this period.
+Violating these terms may lead to a permanent ban.
+
+### 4. Permanent Ban
+
+**Community Impact**: Demonstrating a pattern of violation of community
+standards, including sustained inappropriate behavior, harassment of an
+individual, or aggression toward or disparagement of classes of individuals.
+
+**Consequence**: A permanent ban from any sort of public interaction within
+the community.
+
+## Attribution
+
+This Code of Conduct is adapted from the [Contributor Covenant][homepage],
+version 2.0, available at
+https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.
+
+Community Impact Guidelines were inspired by [Mozilla's code of conduct
+enforcement ladder](https://github.com/mozilla/diversity).
+
+[homepage]: https://www.contributor-covenant.org
+
+For answers to common questions about this code of conduct, see the FAQ at
+https://www.contributor-covenant.org/faq. Translations are available at
+https://www.contributor-covenant.org/translations.
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,3 +1,19 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # How to contribute to transformers?

 Everyone is welcome to contribute, and we value everybody's contribution. Code
@@ -9,6 +25,9 @@ It also helps us if you spread the word: reference the library from blog posts
 on the awesome projects it made possible, shout out on Twitter every time it has
 helped you, or simply star the repo to say "thank you".

+Whichever way you choose to contribute, please be mindful to respect our
+[code of conduct](https://github.com/huggingface/transformers/blob/master/CODE_OF_CONDUCT.md).
+
 ## You can contribute in so many ways!

 There are 4 ways you can contribute to transformers:
@@ -93,7 +112,7 @@ folder.

 ## Start contributing! (Pull Requests)

-Before writing code, we strongly advise you to search through the exising PRs or
+Before writing code, we strongly advise you to search through the existing PRs or
 issues to make sure that nobody is already working on the same thing. If you are
 unsure, it is always a good idea to open an issue to get some feedback.

@@ -122,7 +141,7 @@ Follow these steps to start contributing:
   $ git checkout -b a-descriptive-name-for-my-changes
   ```

-   **do not** work on the `master` branch.
+   **Do not** work on the `master` branch.

 4. Set up a development environment by running the following command in a virtual environment:

@@ -176,13 +195,14 @@ Follow these steps to start contributing:
   ```bash
   $ make quality
   ```
-
   You can do the automatic style corrections and code verifications that can't be automated in one go:

   ```bash
   $ make fixup
   ```

+   This target is also optimized to only work with files modified by the PR you're working on.
+
   If you're modifying documents under `docs/source`, make sure to validate that
   they can still be built. This check also runs in CI. To run a local check
   make sure you have installed the documentation builder requirements, by
@@ -231,7 +251,7 @@ Follow these steps to start contributing:
 ### Checklist

 1. The title of your pull request should be a summary of its contribution;
-2. If your pull request adresses an issue, please mention the issue number in
+2. If your pull request addresses an issue, please mention the issue number in
   the pull request description to make sure they are linked (and people
   consulting the issue know you are working on it);
 3. To indicate a work in progress please prefix the title with `[WIP]`. These
@@ -304,3 +324,32 @@ Check our [documentation writing guide](https://github.com/huggingface/transform
 for more information.

 #### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md)
+
+
+### Develop on Windows
+
+On Windows, you need to configure git to transform Windows `CRLF` line endings to Linux `LF` line endings:
+
+`git config core.autocrlf input`
+
+One way to run the `make` command on Windows is to use MSYS2:
+
+1. [Download MSYS2](https://www.msys2.org/). We assume it is installed in `C:\msys64`.
+2. Open the command line C:\msys64\msys2.exe (it should be available from the start menu)
+3. Run in the shell: `pacman -Syu` and install make with `pacman -S make`
+4. Add `C:\msys64\usr\bin` to your PATH environment variable.
+
+You can now use `make` from any terminal (Powershell, cmd.exe, etc) 🎉
+
+### Syncing forked master with upstream (HuggingFace) master
+
+To avoid pinging the upstream repository, which adds reference notes to each upstream PR and sends unnecessary notifications to the developers involved in these PRs,
+when syncing the master branch of a forked repository, please follow these steps:
+1. When possible, avoid syncing with the upstream using a branch and PR on the forked repository. Instead merge directly into the forked master.
+2. If a PR is absolutely necessary, use the following steps after checking out your branch:
+```
+$ git checkout -b your-branch-for-syncing
+$ git pull --squash --no-commit upstream master
+$ git commit -m '<your message without GitHub references>'
+$ git push --set-upstream origin your-branch-for-syncing
+```
--- a/ISSUES.md
+++ b/ISSUES.md
@@ -0,0 +1,275 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+# How To Request Support
+
+This is an Open Source Project so please be mindful that like in any other project of this kind there is no obligation to answer all requests for help.
+
+However, we want to encourage you to ask for help whenever you think it's needed! We are happy about every question we get because it allows us to better understand your needs and possible misunderstandings and, most importantly, gives you a way to help us make this library better. That being said, this document's main purpose is to provide guidelines on how you can formulate your requests to increase your chances to be understood and to get support.
+
+There are two main venues to receive support: [the forums](https://discuss.huggingface.co/) and [the GitHub issues](https://github.com/huggingface/transformers/issues).
+
+## The Forums
+
+[The user forums](https://discuss.huggingface.co/) are supported by the wide community of the library users and backed up by developers when needed.
+
+If you have a difficulty with deploying this library or some questions, or you'd like to discuss a new feature, please first consider discussing those things at the forums. Only when you feel your subject matter has been crystalized and you still need support from the library developers do proceed to file an [issue](https://github.com/huggingface/transformers/issues).
+
+In particular all "Please explain" questions or objectively very user-specific feature requests belong to the forums. Here are some examples of such questions:
+
+* "I would like to use a BertModel within a RL-Agent for a customer support service. How can I use a BertForMaskedLM in my ChatBotModel?"
+
+* "Could you please explain why T5 has no positional embedding matrix under T5Model?"
+
+* "How should I set my generation parameters for translation?"
+
+* "How to train T5 on De->En translation?"
+
+
+## The GitHub Issues
+
+Everything which hints at a bug should be opened as an [issue](https://github.com/huggingface/transformers/issues).
+
+You are not required to read the following guidelines before opening an issue. However, if you notice that your issue doesn't get any replies, chances are that the developers have one or several difficulties with its quality. In this case, reading the following points and adjusting your issue accordingly could help.
+
+1. Before posting an issue, first search for already posted issues, since chances are someone has already asked a similar question before you.
+
+    If you use Google your search query should be:
+
+    ```
+    "huggingface" "transformers" your query
+    ```
+
+    The first two quoted words tell Google to limit the search to the context of the Huggingface Transformers. The remainder is your query - most commonly this would be the error message the software fails with. We will go deeper into details shortly.
+
+    The results of such a query will typically match GitHub issues, Hugging Face forums, StackExchange, and blogs.
+
+    If you find relevant hints, you may choose to continue the discussion there if you have follow up questions.
+
+    If what you found is similar but doesn't quite answer your problem, please, post a new issue and do include links to similar issues or forum discussions you may have found.
+
+    Let's look at some examples:
+
+    The error message, often referred to as an assertion, tells us what went wrong. Here is an example of an assertion:
+
+   ```python
+   Traceback (most recent call last):
+     File "<string>", line 1, in <module>
+     File "/transformers/src/transformers/__init__.py", line 34, in <module>
+       from . import dependency_versions_check
+     File "/transformers/src/transformers/dependency_versions_check.py", line 34, in <module>
+       from .file_utils import is_tokenizers_available
+     File "/transformers/src/transformers/file_utils.py", line 40, in <module>
+       from tqdm.auto import tqdm
+   ModuleNotFoundError: No module named 'tqdm.auto'
+   ```
+
+   and it typically includes a traceback, so that we can see the full stack of calls the program made before it fails. This gives us the context to know why the program failed.
+
+   Going back to the above example: if you received this error, look at the very last line of the error, which is:
+
+   ```python
+   ModuleNotFoundError: No module named 'tqdm.auto'
+   ```
+
+    And now we can use it to do the searching on your favorite search engine:
+
+    1. first for `"huggingface" "transformers" "ModuleNotFoundError: No module named 'tqdm.auto'"`
+    2. if you don't find relevant results, then search for just `"ModuleNotFoundError: No module named 'tqdm.auto'"`
+    3. and finally if nothing still comes up, then remove the outside quotes: `ModuleNotFoundError: No module named 'tqdm.auto'`
+
+   If the error includes any messages that include bits unique to your filesystem, always remove those in the search query since other users will not have the same filesystem as yours. For example:
+
+   ```bash
+   python -c 'open("/tmp/wrong_path.txt", "r")'
+   Traceback (most recent call last):
+     File "<string>", line 1, in <module>
+   FileNotFoundError: [Errno 2] No such file or directory: '/tmp/wrong_path.txt'
+   ```
+   Here you'd search for just: `"FileNotFoundError: [Errno 2] No such file or directory"`
+
+   If the local information that you removed was inside the error message, you may need to remove the double quotes, since your query is no longer exact. So if the error message was something like:
+
+   ```bash
+      ValueError: '/tmp/wrong_path.txt' cannot be found
+   ```
+
+   then you'd search for `"ValueError" "cannot be found"`
+
+   As you search you will notice that when you don't use quotes often the search engines will return a variety of unrelated hits, which may or may not be what you want.
+
+   Experiment with different ways and find which approach gives the most satisfactory results.
+
+2. Keep the issue short, providing the information that you think will aid the developers to understand your situation. Put yourself in the shoes of the person who has never seen your code or knows anything about your custom setup. This mental exercise will help to develop an intuition for what/what not to share.
+
+3. If there is a software failure, always provide the full traceback, for example:
+
+   ```python
+   $ python -c 'import transformers'
+   Traceback (most recent call last):
+     File "<string>", line 1, in <module>
+     File "/transformers/src/transformers/__init__.py", line 34, in <module>
+       from . import dependency_versions_check
+     File "/transformers/src/transformers/dependency_versions_check.py", line 34, in <module>
+       from .file_utils import is_tokenizers_available
+     File "/transformers/src/transformers/file_utils.py", line 40, in <module>
+       from tqdm.auto import tqdm
+   ModuleNotFoundError: No module named 'tqdm.auto'
+   ```
+
+   As compared to providing just the last line of the error message, e.g.:
+   ```python
+   ModuleNotFoundError: No module named 'tqdm.auto'
+   ```
+   which is not sufficient.
+
+   If your application is running on more than one GPU (e.g. under `DistributedDataParallel`) and typically getting every log and traceback printed multiple times, please make sure that you paste only one copy of it. At times the traceback from parallel processes may get interleaved - so either disentangle these or change the loggers to log only for `local_rank==0` so that only one process logs things.
+
+4. When quoting a traceback, command line instructions and any type of code always enclose it in triple backticks inside the editor window, that is:
+
+   ````
+   ```
+   git clone https://github.com/huggingface/transformers
+   cd transformers
+   pip install .
+   ```
+   ````
+
+   If it's a command line with a long argument list, please consider breaking it down using backslashes and new lines. Here is an example of a good command line quote:
+
+   ```bash
+    cd examples/seq2seq
+    python -m torch.distributed.launch --nproc_per_node=2 ./finetune_trainer.py \
+    --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
+    --output_dir output_dir --overwrite_output_dir \
+    --do_train --n_train 500 --num_train_epochs 1 \
+    --per_device_train_batch_size 1  --freeze_embeds \
+    --src_lang en_XX --tgt_lang ro_RO --task translation \
+    --fp16 --sharded_ddp
+   ```
+
+   If you don't break it up, one has to scroll horizontally which often makes it quite difficult to quickly see what's happening.
+
+   The backslashes allow us to copy the command directly into the console to run it, without needing to edit it.
+
+5. Include only the important information that you think will help the developer to quickly identify the problem.
+
+   For example applications often create huge amounts of logs. Ask yourself whether providing all or parts of the log is useful.
+
+   Pasting 100-1000 lines of log into the issue is an immediate turn off, since it will take a lot of time to figure out where the pertinent parts of the log are.
+
+   Attaching a full log can be helpful if it's done as an attachment, or if it's enclosed in the following HTML code in the comment editor window:
+
+   ```
+   <details>
+   <summary>Full log</summary>
+   <pre>
+
+   many
+   lines
+   go
+   here
+
+   </pre>
+   </details>
+   ```
+
+   which would result in the following entry, which can be opened if desired, but otherwise takes little space.
+
+   <details>
+   <summary>Full log</summary>
+   <pre>
+   many
+   lines
+   go
+   here
+   </pre>
+   </details>
+
+    You could also provide a link to a pastebin service, but this is less beneficial since those links tend to expire quickly and future readers of your issue might not be able to access that log file anymore and may lack some context.
+
+6. If this is an issue in your code, do try to reduce that code to a minimal example that still demonstrates the problem. Please ask at the forums if you have a hard time figuring how to do that. Please realize that we don't have the luxury of having time to try and understand all of your custom code.
+
+   If you really tried to make a short reproducible code but couldn't figure it out, it might be that having a traceback will give the developer enough information to know what's going on. But if it is not enough and we can't reproduce the problem, we can't really solve it.
+
+   Do not despair if you can't figure it out from the beginning, just share what you can and perhaps someone else will be able to help you at the forums.
+
+7. If you forked off some of this project's code or example applications, please, do not ask us to go into your code repository and figure out what you may have done. The code is already very complex and unless there is an easy way to do a diff and it's a small diff, it won't be possible to find someone with time on their hands to make a lengthy investigation. Albeit, you might find someone at the forums who will be generous to do this for you.
+
+8. Before reporting an issue, first, always try to update your environment to the latest official version of this library. We have no resources to go and debug older revisions, which could easily have bugs that have been fixed in the latest released version.
+
+   We understand that this is not always possible, especially when APIs change, in which case file an issue against the highest library version your environment can support.
+
+   Of course, if you upgrade the library, always retest that the problem is still there.
+
+9. Please do not ask us to reproduce an issue with your custom data, since we don't have it. So, either you should use some existing dataset supported by HF datasets or you need to supply code that generates a small sample on the fly, or some other quick and simple way to get it.
+
+   Please do not send us any non-public domain data that may require a license or a permission to be used.
+
+10. Do not tag multiple developers on the issue unless you know this is expected, either because you asked them and they gave you an explicit permission to tag them or the issue template instructs you to do so.
+
+   The "who to tag for what domain" part of the issue template is there to help users direct their questions to the right developers who are designated maintainers of project's specific domains. They can then decide at their own discretion to tag other developers if they feel it'd help move the issue forward.
+
+   We currently don't have a triage service and we trust your capacity to identify the right domain and thus the persons to tag in your issue. If you are not sure, please use the forums to ask for guidance.
+
+   When in doubt, err on the side of not tagging a given person. If you tag multiple people out of context or permission don't be surprised if you get no response at all. Please remember that every time you tag someone, they get a notification and you're taking their time without their permission. Please be sensitive to that.
+
+   If you got helped by one of the developers in the past please don't tag them in future issues, unless they are listed in the issue template for the domain you are asking about or that developer gave you an explicit permission to tag them in future issues.
+
+   If you see a certain developer doing multiple and/or recent commits into a specific area of the project that you feel is relevant to your issue, it is not a good reason to tag them. Various developers may be fixing things that prevent them from moving forward, but often their work is focused on a totally different domain. And while they may or may not know how to help you with the problem at hand, it would benefit the whole community much more if they focus on the domain of their unique expertise.
+
+11. Use the Edit button. Take your time, and re-read and improve the wording and formatting to make your posts and comments as easy to understand as possible.
+
+    Avoid posting multiple comments in a row, as each comment generates a notification for the developers tagged in that issue. If you happened to post multiple comments in a row, and nobody followed up yet - consider merging those into one or a few comments while editing the combined content to be coherent.
+
+    If you choose to edit your older comments after others posted follow up comments you need to be aware that your modifications might not be noticed, so if it's not a typo fixing, try to write a new comment flagging that something has been changed in the previous comments.
+
+    For example, the very first comment is the most important one. If while the thread unfolds you realize that things aren't as they seemed to you originally you may want to edit the first post to reflect the up-to-date understanding of the issue at hand so that it helps those who read your issue in the future quickly understand what's going on and not need to sift through dozens of comments. It also helps to indicate that the post was edited. So, those reading the thread later can understand why there might be certain discontinuity in the information flow.
+
+    Use bullets and items if you have lists of items and the outcome improves overall readability.
+
+    Use backticks to refer to class and function names, e.g. `BartModel` and `generate` as these stand out and improve the speed of a reader's comprehension.
+
+    Try not to use italics and bold text too much as these often make the text more difficult to read.
+
+
+12. If you are cross-referencing a specific comment in a given thread or another issue, always link to that specific comment, rather than using the issue link. If you do the latter it could be quite impossible to find which specific comment you're referring to.
+
+    To get the link to the specific comment do not copy the url from the location bar of your browser, but instead, click the `...` icon in the upper right corner of the comment and then select "Copy Link".
+
+    For example the first link is a link to an issue, and the second to a specific comment in the same issue:
+
+    1. https://github.com/huggingface/transformers/issues/9257
+    2. https://github.com/huggingface/transformers/issues/9257#issuecomment-749945162
+
+
+13. If you are replying to a last comment, it's totally fine to make your reply with just your comment in it. The readers can follow the information flow here.
+
+    But if you're replying to a comment that happened some comments back it's always a good practice to quote just the relevant lines you're replying to. The `>` is used for quoting, or you can always use the menu to do so. For example your editor box will look like:
+
+    ```
+    > How big is your gpu cluster?
+
+    Our cluster is made of 256 gpus.
+    ```
+
+    If you are addressing multiple comments, quote the relevant parts of each before your answer. Some people use the same comment to do multiple replies, others separate them into separate comments. Either way works. The latter approach helps for linking to a specific comment.
+
+In general the best way to figure out what works the best is to learn from issues posted by other people - see which issues get great responses and which get little to no response - observe what the posters who received great responses did differently from those who did not.
+
+Thank you for reading this somewhat lengthy document. We would like to conclude that these are not absolute rules, but friendly advice that will help maximize the chances for us to understand what you are trying to communicate, reproduce the problem, then resolve it to your satisfaction and the benefit of the whole community.
+
+If after reading this document there are remaining questions on how and why or there is a need for further elucidation, please, don't hesitate to ask your question in [this thread](https://discuss.huggingface.co/t/how-to-request-support/3128).
--- a/1
+++ b/1
@@ -1,3 +1,4 @@
+Copyright 2018- The Hugging Face team. All rights reserved.

                                 Apache License
                           Version 2.0, January 2004
--- a/51
+++ b/51
@@ -1,29 +1,58 @@
-.PHONY: quality_checks quality style fixup test test-examples docs
+.PHONY: deps_table_update modified_only_fixup extra_quality_checks quality style fixup fix-copies test test-examples docs
+
+
+check_dirs := examples tests src utils
+
+modified_only_fixup:
+	$(eval modified_py_files := $(shell python utils/get_modified_files.py $(check_dirs)))
+	@if test -n "$(modified_py_files)"; then \
+		echo "Checking/fixing $(modified_py_files)"; \
+		black $(modified_py_files); \
+		isort $(modified_py_files); \
+		flake8 $(modified_py_files); \
+	else \
+		echo "No library .py files were modified"; \
+	fi
+
+# Update src/transformers/dependency_versions_table.py
+
+deps_table_update:
+	@python setup.py deps_table_update

 # Check that source code meets quality standards

-quality_checks:
-	flake8 examples templates tests src utils
+extra_quality_checks: deps_table_update
 	python utils/check_copies.py
+	python utils/check_table.py
+	python utils/check_dummies.py
 	python utils/check_repo.py
+	python utils/style_doc.py src/transformers docs/source --max_len 119

+# this target runs checks on all files
 quality:
-	black --check examples templates tests src utils
-	isort --check-only examples templates tests src utils
-	${MAKE} quality_checks
+	black --check $(check_dirs)
+	isort --check-only $(check_dirs)
+	flake8 $(check_dirs)
+	python utils/style_doc.py src/transformers docs/source --max_len 119 --check_only
+	${MAKE} extra_quality_checks

 # Format source code automatically and check is there are any problems left that need manual fixing

-style:
-	black examples templates tests src utils
-	isort examples templates tests src utils
+style: deps_table_update
+	black $(check_dirs)
+	isort $(check_dirs)
+	python utils/style_doc.py src/transformers docs/source --max_len 119

-fixup: style quality_checks
+# Super fast fix and check target that only works on relevant modified files since the branch was made
+
+fixup: modified_only_fixup extra_quality_checks

 # Make marked copies of snippets of codes conform to the original

 fix-copies:
 	python utils/check_copies.py --fix_and_overwrite
+	python utils/check_table.py --fix_and_overwrite
+	python utils/check_dummies.py --fix_and_overwrite

 # Run tests for the library

@@ -38,4 +67,4 @@ test-examples:
 # Check that docs can build

 docs:
-	cd docs && make html SPHINXOPTS="-W"
+	cd docs && make html SPHINXOPTS="-W -j 4"
--- a/README.md
+++ b/README.md
@@ -1,3 +1,19 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 <p align="center">
    <br>
    <img src="https://raw.githubusercontent.com/huggingface/transformers/master/docs/source/imgs/transformers_logo_name.png" width="400"/>
@@ -16,26 +32,26 @@
    <a href="https://github.com/huggingface/transformers/releases">
        <img alt="GitHub release" src="https://img.shields.io/github/release/huggingface/transformers.svg">
    </a>
+    <a href="https://github.com/huggingface/transformers/blob/master/CODE_OF_CONDUCT.md">
+        <img alt="Contributor Covenant" src="https://img.shields.io/badge/Contributor%20Covenant-v2.0%20adopted-ff69b4.svg">
+    </a>
 </p>

 <h3 align="center">
 <p>State-of-the-art Natural Language Processing for PyTorch and TensorFlow 2.0
 </h3>

-🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation, etc in 100+ languages. Its aim is to make cutting-edge NLP easier to use for everyone. 
+🤗 Transformers provides thousands of pretrained models to perform tasks on texts such as classification, information extraction, question answering, summarization, translation, text generation, etc in 100+ languages. Its aim is to make cutting-edge NLP easier to use for everyone.

-🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each python module defining an architecture can be used as a standalone and modified to enable quick research experiments. 
+🤗 Transformers provides APIs to quickly download and use those pretrained models on a given text, fine-tune them on your own datasets then share them with the community on our [model hub](https://huggingface.co/models). At the same time, each python module defining an architecture can be used as a standalone and modified to enable quick research experiments.

 🤗 Transformers is backed by the two most popular deep learning libraries, [PyTorch](https://pytorch.org/) and [TensorFlow](https://www.tensorflow.org/), with a seamless integration between them, allowing you to train your models with one then load it for inference with the other.

-### Recent contributors
-[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/0)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/0)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/1)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/1)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/2)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/2)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/3)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/3)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/4)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/4)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/5)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/5)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/6)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/6)[![](https://sourcerer.io/fame/clmnt/huggingface/transformers/images/7)](https://sourcerer.io/fame/clmnt/huggingface/transformers/links/7)
-
 ## Online demos

-You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer an [inference API](https://huggingface.co/pricing) to use those models.
+You can test most of our models directly on their pages from the [model hub](https://huggingface.co/models). We also offer [private model hosting, versioning, & an inference API](https://huggingface.co/pricing) to use those models.

-Here are a few examples: 
+Here are a few examples:
 - [Masked word completion with BERT](https://huggingface.co/bert-base-uncased?text=Paris+is+the+%5BMASK%5D+of+France)
 - [Name Entity Recognition with Electra](https://huggingface.co/dbmdz/electra-large-discriminator-finetuned-conll03-english?text=My+name+is+Sarah+and+I+live+in+London+city)
 - [Text generation with GPT-2](https://huggingface.co/gpt2?text=A+long+time+ago%2C+)
@@ -48,7 +64,7 @@ Here are a few examples:

 ## Quick tour

-To immediately use a model on a given text, we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model training. Here is how to quickly use a pipeline to classify positive versus negative texts 
+To immediately use a model on a given text, we provide the `pipeline` API. Pipelines group together a pretrained model with the preprocessing that was used during that model training. Here is how to quickly use a pipeline to classify positive versus negative texts

 ```python
 >>> from transformers import pipeline
@@ -59,7 +75,7 @@ To immediately use a model on a given text, we provide the `pipeline` API. Pipel
 [{'label': 'POSITIVE', 'score': 0.9978193640708923}]
 ```

-The second line of code downloads and caches the pretrained model used by the pipeline, the third line evaluates it on the given text. Here the answer is "positive" with a confidence of 99.8%. 
+The second line of code downloads and caches the pretrained model used by the pipeline, the third line evaluates it on the given text. Here the answer is "positive" with a confidence of 99.8%.

 This is another example of pipeline used for that can extract question answers from some context:

@@ -78,7 +94,7 @@ This is another example of pipeline used for that can extract question answers f

 On top of the answer, the pretrained model used here returned its confidence score, along with the start position and its end position in the tokenized sentence. You can learn more about the tasks supported by the `pipeline` API in [this tutorial](https://huggingface.co/transformers/task_summary.html).

-To download and use any of the pretrained models on your given task, you just need to use those three lines of codes (PyTorch verison):
+To download and use any of the pretrained models on your given task, you just need to use those three lines of codes (PyTorch version):
 ```python
 >>> from transformers import AutoTokenizer, AutoModel

@@ -108,7 +124,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta
 1. Easy-to-use state-of-the-art models:
    - High performance on NLU and NLG tasks.
    - Low barrier to entry for educators and practitioners.
-    - Few user-facing abastractions with just three classes to learn.
+    - Few user-facing abstractions with just three classes to learn.
    - A unified API for using all our pretrained models.

 1. Lower compute costs, smaller carbon footprint:
@@ -124,7 +140,7 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta
 1. Easily customize a model or an example to your needs:
    - Examples for each architecture to reproduce the results by the official authors of said architecture.
    - Expose the models internal as consistently as possible.
-    - Model files can be used independently of the library for quick experiments. 
+    - Model files can be used independently of the library for quick experiments.

 ## Why shouldn't I use transformers?

@@ -134,14 +150,16 @@ The model itself is a regular [Pytorch `nn.Module`](https://pytorch.org/docs/sta

 ## Installation

+### With pip
+
 This repository is tested on Python 3.6+, PyTorch 1.0.0+ (PyTorch 1.3.1+ for [examples](https://github.com/huggingface/transformers/tree/master/examples)) and TensorFlow 2.0.

 You should install 🤗 Transformers in a [virtual environment](https://docs.python.org/3/library/venv.html). If you're unfamiliar with Python virtual environments, check out the [user guide](https://packaging.python.org/guides/installing-using-pip-and-virtual-environments/).

 First, create a virtual environment with the version of Python you're going to use and activate it.

-Then, you will need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
+Then, you will need to install at least one of TensorFlow 2.0, PyTorch or Flax.
+Please refer to the [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available), [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or [Flax installation page](https://github.com/google/flax#quick-install) regarding the specific install command for your platform.

 When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:

@@ -151,41 +169,70 @@ pip install transformers

 If you'd like to play with the examples, you must [install the library from source](https://huggingface.co/transformers/installation.html#installing-from-source).

+### With conda
+
+Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
+
+🤗 Transformers can be installed using conda as follows:
+
+```shell script
+conda install -c huggingface transformers
+```
+
+Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda. 
+
 ## Models architectures

+**[All the model checkpoints](https://huggingface.co/models)** provided by 🤗 Transformers are seamlessly integrated from the huggingface.co [model hub](https://huggingface.co) where they are uploaded directly by [users](https://huggingface.co/users) and [organizations](https://huggingface.co/organizations).
+
+Current number of checkpoints: ![](https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen)
+
 🤗 Transformers currently provides the following architectures (see [here](https://huggingface.co/transformers/model_summary.html) for a high-level summary of each them):

+1. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+1. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+1. **[BARThez](https://huggingface.co/transformers/model_doc/barthez.html)** (from École polytechnique) released with the paper [BARThez: a Skilled Pretrained French Sequence-to-Sequence Model](https://arxiv.org/abs/2010.12321) by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis.
 1. **[BERT](https://huggingface.co/transformers/model_doc/bert.html)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
-2. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
-3. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
-4. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
-5. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
-6. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-7. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
-9. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-10. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
-11. **[ALBERT](https://huggingface.co/transformers/model_doc/albert.html)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-12. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-13. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
-14. **[MMBT](https://github.com/facebookresearch/mmbt/)** (from Facebook), released together with the paper a [Supervised Multimodal Bitransformers for Classifying Images and Text](https://arxiv.org/pdf/1909.02950.pdf) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
-15. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
-16. **[BART](https://huggingface.co/transformers/model_doc/bart.html)** (from Facebook) released with the paper [BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension](https://arxiv.org/pdf/1910.13461.pdf) by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
-17. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
-18. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-19. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
-20. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
-21. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
-22. **[DPR](https://github.com/facebookresearch/DPR)** (from Facebook) released with the paper [Dense Passage Retrieval
+1. **[BERT For Sequence Generation](https://huggingface.co/transformers/model_doc/bertgeneration.html)** (from Google) released with the paper [Leveraging Pre-trained Checkpoints for Sequence Generation Tasks](https://arxiv.org/abs/1907.12461) by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
+1. **[Blenderbot](https://huggingface.co/transformers/model_doc/blenderbot.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[BlenderbotSmall](https://huggingface.co/transformers/model_doc/blenderbot_small.html)** (from Facebook) released with the paper [Recipes for building an open-domain chatbot](https://arxiv.org/abs/2004.13637) by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+1. **[CamemBERT](https://huggingface.co/transformers/model_doc/camembert.html)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+1. **[CTRL](https://huggingface.co/transformers/model_doc/ctrl.html)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+1. **[DeBERTa](https://huggingface.co/transformers/model_doc/deberta.html)** (from Microsoft Research) released with the paper [DeBERTa: Decoding-enhanced BERT with Disentangled Attention](https://arxiv.org/abs/2006.03654) by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen.
+1. **[DialoGPT](https://huggingface.co/transformers/model_doc/dialogpt.html)** (from Microsoft Research) released with the paper [DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation](https://arxiv.org/abs/1911.00536) by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+1. **[DistilBERT](https://huggingface.co/transformers/model_doc/distilbert.html)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation), RoBERTa into [DistilRoBERTa](https://github.com/huggingface/transformers/tree/master/examples/distillation), Multilingual BERT into [DistilmBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation) and a German version of DistilBERT.
+1. **[DPR](https://huggingface.co/transformers/model_doc/dpr.html)** (from Facebook) released with the paper [Dense Passage Retrieval
 for Open-Domain Question Answering](https://arxiv.org/abs/2004.04906) by Vladimir Karpukhin, Barlas Oğuz, Sewon
 Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-23. **[Pegasus](https://github.com/google-research/pegasus)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777)> by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-24. **[MBart](https://github.com/pytorch/fairseq/tree/master/examples/mbart)** (from Facebook) released with the paper  [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.  
-25. **[LXMERT](https://github.com/airsplay/lxmert)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
-26. **[Funnel Transformer](https://github.com/laiguokun/Funnel-Transformer)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-27. **[LayoutLM](https://github.com/microsoft/unilm/tree/master/layoutlm)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-28. **[Other community models](https://huggingface.co/models)**, contributed by the [community](https://huggingface.co/users).
-29. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
+1. **[ELECTRA](https://huggingface.co/transformers/model_doc/electra.html)** (from Google Research/Stanford University) released with the paper [ELECTRA: Pre-training text encoders as discriminators rather than generators](https://arxiv.org/abs/2003.10555) by Kevin Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+1. **[FlauBERT](https://huggingface.co/transformers/model_doc/flaubert.html)** (from CNRS) released with the paper [FlauBERT: Unsupervised Language Model Pre-training for French](https://arxiv.org/abs/1912.05372) by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+1. **[Funnel Transformer](https://huggingface.co/transformers/model_doc/funnel.html)** (from CMU/Google Brain) released with the paper [Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing](https://arxiv.org/abs/2006.03236) by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+1. **[GPT](https://huggingface.co/transformers/model_doc/gpt.html)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+1. **[GPT-2](https://huggingface.co/transformers/model_doc/gpt2.html)** (from OpenAI) released with the paper [Language Models are Unsupervised Multitask Learners](https://blog.openai.com/better-language-models/) by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+1. **[LayoutLM](https://huggingface.co/transformers/model_doc/layoutlm.html)** (from Microsoft Research Asia) released with the paper [LayoutLM: Pre-training of Text and Layout for Document Image Understanding](https://arxiv.org/abs/1912.13318) by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+1. **[LED](https://huggingface.co/transformers/model_doc/led.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+1. **[Longformer](https://huggingface.co/transformers/model_doc/longformer.html)** (from AllenAI) released with the paper [Longformer: The Long-Document Transformer](https://arxiv.org/abs/2004.05150) by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+1. **[LXMERT](https://huggingface.co/transformers/model_doc/lxmert.html)** (from UNC Chapel Hill) released with the paper [LXMERT: Learning Cross-Modality Encoder Representations from Transformers for Open-Domain Question Answering](https://arxiv.org/abs/1908.07490) by Hao Tan and Mohit Bansal.
+1. **[MarianMT](https://huggingface.co/transformers/model_doc/marian.html)** Machine translation models trained using [OPUS](http://opus.nlpl.eu/) data by Jörg Tiedemann. The [Marian Framework](https://marian-nmt.github.io/) is being developed by the Microsoft Translator Team.
+1. **[MBart](https://huggingface.co/transformers/model_doc/mbart.html)** (from Facebook) released with the paper [Multilingual Denoising Pre-training for Neural Machine Translation](https://arxiv.org/abs/2001.08210) by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+1. **[MPNet](https://huggingface.co/transformers/model_doc/mpnet.html)** (from Microsoft Research) released with the paper [MPNet: Masked and Permuted Pre-training for Language Understanding](https://arxiv.org/abs/2004.09297) by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
+1. **[MT5](https://huggingface.co/transformers/model_doc/mt5.html)** (from Google AI) released with the paper [mT5: A massively multilingual pre-trained text-to-text transformer](https://arxiv.org/abs/2010.11934) by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
+1. **[Pegasus](https://huggingface.co/transformers/model_doc/pegasus.html)** (from Google) released with the paper [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization](https://arxiv.org/abs/1912.08777) by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
+1. **[ProphetNet](https://huggingface.co/transformers/model_doc/prophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[Reformer](https://huggingface.co/transformers/model_doc/reformer.html)** (from Google Research) released with the paper [Reformer: The Efficient Transformer](https://arxiv.org/abs/2001.04451) by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+1. **[RoBERTa](https://huggingface.co/transformers/model_doc/roberta.html)** (from Facebook), released together with the paper [RoBERTa: A Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+1. **[SqueezeBert](https://huggingface.co/transformers/model_doc/squeezebert.html)** released with the paper [SqueezeBERT: What can computer vision teach NLP about efficient neural networks?](https://arxiv.org/abs/2006.11316) by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, and Kurt W. Keutzer.
+1. **[T5](https://huggingface.co/transformers/model_doc/t5.html)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+1. **[TAPAS](https://huggingface.co/transformers/model_doc/tapas.html)** (from Google AI) released with the paper [TAPAS: Weakly Supervised Table Parsing via Pre-training](https://arxiv.org/abs/2004.02349) by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller, Francesco Piccinno and Julian Martin Eisenschlos.
+1. **[Transformer-XL](https://huggingface.co/transformers/model_doc/transformerxl.html)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+1. **[XLM](https://huggingface.co/transformers/model_doc/xlm.html)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+1. **[XLM-ProphetNet](https://huggingface.co/transformers/model_doc/xlmprophetnet.html)** (from Microsoft Research) released with the paper [ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training](https://arxiv.org/abs/2001.04063) by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+1. **[XLM-RoBERTa](https://huggingface.co/transformers/model_doc/xlmroberta.html)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+1. **[XLNet](https://huggingface.co/transformers/model_doc/xlnet.html)** (from Google/CMU) released with the paper [XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+1. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedback before starting your PR.
+
+To check if each model has an implementation in PyTorch/TensorFlow/Flax or has an associated tokenizer backed by the 🤗 Tokenizers library, refer to [this table](https://huggingface.co/transformers/index.html#bigtable).

 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations. You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).

@@ -204,13 +251,17 @@ These implementations have been tested on several datasets (see the example scri

 ## Citation

-We now have a [paper](https://arxiv.org/abs/1910.03771) you can cite for the 🤗 Transformers library:
+We now have a [paper](https://www.aclweb.org/anthology/2020.emnlp-demos.6/) you can cite for the 🤗 Transformers library:
 ```bibtex
-@article{Wolf2019HuggingFacesTS,
-  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
-  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush},
-  journal={ArXiv},
-  year={2019},
-  volume={abs/1910.03771}
+@inproceedings{wolf-etal-2020-transformers,
+    title = "Transformers: State-of-the-Art Natural Language Processing",
+    author = "Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Joe Davison and Sam Shleifer and Patrick von Platen and Clara Ma and Yacine Jernite and Julien Plu and Canwen Xu and Teven Le Scao and Sylvain Gugger and Mariama Drame and Quentin Lhoest and Alexander M. Rush",
+    booktitle = "Proceedings of the 2020 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
+    month = oct,
+    year = "2020",
+    address = "Online",
+    publisher = "Association for Computational Linguistics",
+    url = "https://www.aclweb.org/anthology/2020.emnlp-demos.6",
+    pages = "38--45"
 }
 ```
--- a/codecov.yml
+++ b/codecov.yml
@@ -1,10 +0,0 @@
-coverage:
-  status:
-    project:
-      default:
-        informational: true
-    patch: off
-comment:
-  require_changes: true    # only comment if there was change in coverage
-  require_head: yes        # don't report if there is no head coverage report
-  require_base: yes        # don't report if there is no base coverage report
--- a/docker/transformers-gpu/Dockerfile
+++ b/docker/transformers-gpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
 LABEL maintainer="Hugging Face"
 LABEL repository="transformers"

@@ -18,9 +18,14 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    tensorflow \
    torch

+RUN git clone https://github.com/NVIDIA/apex
+RUN cd apex && \
+    python3 setup.py install && \
+    pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+
 WORKDIR /workspace
 COPY . transformers/
 RUN cd transformers/ && \
    python3 -m pip install --no-cache-dir .

-CMD ["/bin/bash"]
+CMD ["/bin/bash"]
--- a/docker/transformers-pytorch-gpu/Dockerfile
+++ b/docker/transformers-pytorch-gpu/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:10.1-cudnn7-runtime-ubuntu18.04
+FROM nvidia/cuda:10.2-cudnn7-devel-ubuntu18.04
 LABEL maintainer="Hugging Face"
 LABEL repository="transformers"

@@ -17,9 +17,14 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip && \
    mkl \
    torch

+RUN git clone https://github.com/NVIDIA/apex
+RUN cd apex && \
+    python3 setup.py install && \
+    pip install -v --no-cache-dir --global-option="--cpp_ext" --global-option="--cuda_ext" ./
+
 WORKDIR /workspace
 COPY . transformers/
 RUN cd transformers/ && \
    python3 -m pip install --no-cache-dir .

-CMD ["/bin/bash"]
+CMD ["/bin/bash"]
--- a/docs/README.md
+++ b/docs/README.md
@@ -1,3 +1,19 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # Generating the documentation

 To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -2,6 +2,15 @@

 /* Colab dropdown */

+table.center-aligned-table td {
+    text-align: center;
+}
+
+table.center-aligned-table th {
+    text-align: center;
+    vertical-align: middle;
+}
+
 .colab-dropdown {
    position: relative;
    display: inline-block;
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -1,11 +1,16 @@
 // These two things need to be updated at each release for the version selector.
 // Last stable version
-const stableVersion = "v3.2.0"
-// Dictionary doc folder to label
+const stableVersion = "v4.1.1"
+// Dictionary doc folder to label. The last stable version should have an empty key.
 const versionMapping = {
    "master": "master",
-    "": "v3.2.0",
-    "v3.1.0": "v3.1.0 (stable)",
+    "": "v4.1.1 (stable)",
+    "v4.0.1": "v4.0.0/v4.0.1",
+    "v3.5.1": "v3.5.0/v3.5.1",
+    "v3.4.0": "v3.4.0",
+    "v3.3.1": "v3.3.0/v3.3.1",
+    "v3.2.0": "v3.2.0",
+    "v3.1.0": "v3.1.0",
    "v3.0.2": "v3.0.0/v3.0.1/v3.0.2",
    "v2.11.0": "v2.11.0",
    "v2.10.0": "v2.10.0",
@@ -235,9 +240,11 @@ function platformToggle() {

    const createFrameworkButtons = sample => {
            const pytorchButton = document.createElement("button");
+            pytorchButton.classList.add('pytorch-button')
            pytorchButton.innerText = "PyTorch";

            const tensorflowButton = document.createElement("button");
+            tensorflowButton.classList.add('tensorflow-button')
            tensorflowButton.innerText = "TensorFlow";

            const selectorDiv = document.createElement("div");
@@ -252,22 +259,36 @@ function platformToggle() {
            tensorflowButton.classList.remove("selected");

            pytorchButton.addEventListener("click", () => {
-                sample.element.innerHTML = sample.pytorchSample;
-                pytorchButton.classList.add("selected");
-                tensorflowButton.classList.remove("selected");
+                for(const codeBlock of updatedCodeBlocks){
+                    codeBlock.element.innerHTML = codeBlock.pytorchSample;
+                }
+                Array.from(document.getElementsByClassName('pytorch-button')).forEach(button => {
+                    button.classList.add("selected");
+                })
+                Array.from(document.getElementsByClassName('tensorflow-button')).forEach(button => {
+                    button.classList.remove("selected");
+                })
            });
            tensorflowButton.addEventListener("click", () => {
-               sample.element.innerHTML = sample.tensorflowSample;
-                tensorflowButton.classList.add("selected");
-                pytorchButton.classList.remove("selected");
+                for(const codeBlock of updatedCodeBlocks){
+                    codeBlock.element.innerHTML = codeBlock.tensorflowSample;
+                }
+                Array.from(document.getElementsByClassName('tensorflow-button')).forEach(button => {
+                    button.classList.add("selected");
+                })
+                Array.from(document.getElementsByClassName('pytorch-button')).forEach(button => {
+                    button.classList.remove("selected");
+                })
            });
        };

-    codeBlocks
+    const updatedCodeBlocks = codeBlocks
        .map(element => {return {element: element.firstChild, innerText: element.innerText}})
        .filter(codeBlock => codeBlock.innerText.includes(pytorchIdentifier) && codeBlock.innerText.includes(tensorflowIdentifier))
        .map(getFrameworkSpans)
-        .forEach(createFrameworkButtons);
+
+    updatedCodeBlocks
+        .forEach(createFrameworkButtons)
 }


--- a/docs/source/benchmarks.rst
+++ b/docs/source/benchmarks.rst
@@ -1,23 +1,41 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Benchmarks
 =======================================================================================================================

 Let's take a look at how 🤗 Transformer models can be benchmarked, best practices, and already available benchmarks.

-A notebook explaining in more detail how to benchmark 🤗 Transformer models can be found `here <https://github.com/huggingface/transformers/blob/master/notebooks/05-benchmark.ipynb>`__.
+A notebook explaining in more detail how to benchmark 🤗 Transformer models can be found :prefix_link:`here
+<notebooks/05-benchmark.ipynb>`.

 How to benchmark 🤗 Transformer models
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` allow to flexibly benchmark 🤗 Transformer models.
-The benchmark classes allow us to measure the `peak memory usage` and `required time` for both 
-`inference` and `training`. 
+The classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` allow flexible
+benchmarking of 🤗 Transformer models. The benchmark classes allow us to measure the `peak memory usage` and
+`required time` for both `inference` and `training`.

 .. note::

-  Hereby, `inference` is defined by a single forward pass, and `training` is defined by a single forward pass and backward pass.
+  Hereby, `inference` is defined by a single forward pass, and `training` is defined by a single forward pass and
+  backward pass.

-The benchmark classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` expect an object of type :class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments`, respectively, for instantiation. :class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments` are data classes and contain all relevant configurations for their corresponding benchmark class.
-In the following example, it is shown how a BERT model of type `bert-base-cased` can be benchmarked.
+The benchmark classes :class:`~transformers.PyTorchBenchmark` and :class:`~transformers.TensorFlowBenchmark` expect an
+object of type :class:`~transformers.PyTorchBenchmarkArguments` and
+:class:`~transformers.TensorFlowBenchmarkArguments`, respectively, for instantiation.
+:class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments` are data
+classes and contain all relevant configurations for their corresponding benchmark class. In the following example, it
+is shown how a BERT model of type `bert-base-cased` can be benchmarked.

 .. code-block::

@@ -34,11 +52,15 @@ In the following example, it is shown how a BERT model of type `bert-base-cased`
    >>> benchmark = TensorFlowBenchmark(args)


-Here, three arguments are given to the benchmark argument data classes, namely ``models``, ``batch_sizes``, and ``sequence_lengths``. The argument ``models`` is required and expects a :obj:`list` of model identifiers from the `model hub <https://huggingface.co/models>`__
-The :obj:`list` arguments ``batch_sizes`` and ``sequence_lengths`` define the size of the ``input_ids`` on which the model is benchmarked. 
-There are many more parameters that can be configured via the benchmark argument data classes. For more detail on these one can either directly consult the files 
-``src/transformers/benchmark/benchmark_args_utils.py``, ``src/transformers/benchmark/benchmark_args.py`` (for PyTorch) and ``src/transformers/benchmark/benchmark_args_tf.py`` (for Tensorflow). 
-Alternatively, running the following shell commands from root will print out a descriptive list of all configurable parameters for PyTorch and Tensorflow respectively.
+Here, three arguments are given to the benchmark argument data classes, namely ``models``, ``batch_sizes``, and
+``sequence_lengths``. The argument ``models`` is required and expects a :obj:`list` of model identifiers from the
+`model hub <https://huggingface.co/models>`__. The :obj:`list` arguments ``batch_sizes`` and ``sequence_lengths``
+define the size of the ``input_ids`` on which the model is benchmarked. There are many more parameters that can be
+configured via the benchmark argument data classes. For more detail on these one can either directly consult the files
+``src/transformers/benchmark/benchmark_args_utils.py``, ``src/transformers/benchmark/benchmark_args.py`` (for PyTorch)
+and ``src/transformers/benchmark/benchmark_args_tf.py`` (for Tensorflow). Alternatively, running the following shell
+commands from root will print out a descriptive list of all configurable parameters for PyTorch and Tensorflow
+respectively.

 .. code-block:: bash

@@ -65,7 +87,7 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
    bert-base-uncased          8              128            0.018     
    bert-base-uncased          8              512            0.088     
    --------------------------------------------------------------------------------
-    
+
    ====================      INFERENCE - MEMORY - RESULT       ====================
    --------------------------------------------------------------------------------
    Model Name             Batch Size     Seq Length    Memory in MB 
@@ -75,8 +97,9 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
    bert-base-uncased          8              128            1307
    bert-base-uncased          8              512            1539
    --------------------------------------------------------------------------------
-    
+
    ====================        ENVIRONMENT INFORMATION         ====================
+
    - transformers_version: 2.11.0
    - framework: PyTorch
    - use_torchscript: False
@@ -98,7 +121,7 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
    - gpu_power_watts: 280.0
    - gpu_performance_state: 2
    - use_tpu: False
-    
+
    >>> ## TENSORFLOW CODE
    >>> results = benchmark.run()
    >>> print(results)
@@ -111,7 +134,7 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
    bert-base-uncased          8              128            0.022
    bert-base-uncased          8              512            0.105
    --------------------------------------------------------------------------------
-    
+
    ====================      INFERENCE - MEMORY - RESULT       ====================
    --------------------------------------------------------------------------------
    Model Name             Batch Size     Seq Length    Memory in MB 
@@ -121,8 +144,9 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
    bert-base-uncased          8              128            1330
    bert-base-uncased          8              512            1770
    --------------------------------------------------------------------------------
-    
+
    ====================        ENVIRONMENT INFORMATION         ====================
+
    - transformers_version: 2.11.0
    - framework: Tensorflow
    - use_xla: False
@@ -145,14 +169,17 @@ An instantiated benchmark object can then simply be run by calling ``benchmark.r
    - gpu_performance_state: 2
    - use_tpu: False

-By default, the `time` and the `required memory` for `inference` are benchmarked. 
-In the example output above the first two sections show the result corresponding to `inference time` and `inference memory`. 
-In addition, all relevant information about the computing environment, `e.g.` the GPU type, the system, the library versions, etc... are printed out in the third section under `ENVIRONMENT INFORMATION`.
-This information can optionally be saved in a `.csv` file when adding the argument :obj:`save_to_csv=True` to :class:`~transformers.PyTorchBenchmarkArguments` and :class:`~transformers.TensorFlowBenchmarkArguments` respectively.
-In this case, every section is saved in a separate `.csv` file. The path to each `.csv` file can optionally be defined via the argument data classes.
+By default, the `time` and the `required memory` for `inference` are benchmarked. In the example output above the first
+two sections show the result corresponding to `inference time` and `inference memory`. In addition, all relevant
+information about the computing environment, `e.g.` the GPU type, the system, the library versions, etc., is printed
+out in the third section under `ENVIRONMENT INFORMATION`. This information can optionally be saved in a `.csv` file
+when adding the argument :obj:`save_to_csv=True` to :class:`~transformers.PyTorchBenchmarkArguments` and
+:class:`~transformers.TensorFlowBenchmarkArguments` respectively. In this case, every section is saved in a separate
+`.csv` file. The path to each `.csv` file can optionally be defined via the argument data classes.

-Instead of benchmarking pre-trained models via their model identifier, `e.g.` `bert-base-uncased`, the user can alternatively benchmark an arbitrary configuration of any available model class. 
-In this case, a :obj:`list` of configurations must be inserted with the benchmark args as follows.
+Instead of benchmarking pre-trained models via their model identifier, `e.g.` `bert-base-uncased`, the user can
+alternatively benchmark an arbitrary configuration of any available model class. In this case, a :obj:`list` of
+configurations must be inserted with the benchmark args as follows.

 .. code-block::

@@ -183,7 +210,7 @@ In this case, a :obj:`list` of configurations must be inserted with the benchmar
    bert-6-lay                 8              128            0.009     
    bert-6-lay                 8              512            0.044
    --------------------------------------------------------------------------------
-    
+
    ====================      INFERENCE - MEMORY - RESULT       ====================
    --------------------------------------------------------------------------------
    Model Name             Batch Size     Seq Length      Memory in MB 
@@ -201,8 +228,9 @@ In this case, a :obj:`list` of configurations must be inserted with the benchmar
    bert-6-lay                 8              128            1127     
    bert-6-lay                 8              512            1359
    --------------------------------------------------------------------------------
-    
+
    ====================        ENVIRONMENT INFORMATION         ====================
+
    - transformers_version: 2.11.0
    - framework: PyTorch
    - use_torchscript: False
@@ -252,7 +280,7 @@ In this case, a :obj:`list` of configurations must be inserted with the benchmar
    bert-6-lay                 8              128            0.0011
    bert-6-lay                 8              512            0.074
    --------------------------------------------------------------------------------
-    
+
    ====================      INFERENCE - MEMORY - RESULT       ====================
    --------------------------------------------------------------------------------
    Model Name             Batch Size     Seq Length      Memory in MB 
@@ -270,8 +298,9 @@ In this case, a :obj:`list` of configurations must be inserted with the benchmar
    bert-6-lay                 8              128            1330
    bert-6-lay                 8              512            1540
    --------------------------------------------------------------------------------
-    
+
    ====================        ENVIRONMENT INFORMATION         ====================
+
    - transformers_version: 2.11.0
    - framework: Tensorflow
    - use_xla: False
@@ -295,8 +324,9 @@ In this case, a :obj:`list` of configurations must be inserted with the benchmar
    - use_tpu: False


-Again, `inference time` and `required memory` for `inference` are measured, but this time for customized configurations of the :obj:`BertModel` class. This feature can especially be helpful when 
-deciding for which configuration the model should be trained.
+Again, `inference time` and `required memory` for `inference` are measured, but this time for customized configurations
+of the :obj:`BertModel` class. This feature can especially be helpful when deciding for which configuration the model
+should be trained.


 Benchmark best practices
@@ -304,19 +334,28 @@ Benchmark best practices

 This section lists a couple of best practices one should be aware of when benchmarking a model.

- Currently, only single device benchmarking is supported. When benchmarking on GPU, it is recommended that the user 
-  specifies on which device the code should be run by setting the ``CUDA_VISIBLE_DEVICES`` environment variable in the shell, `e.g.` ``export CUDA_VISIBLE_DEVICES=0`` before running the code.
- The option :obj:`no_multi_processing` should only be set to :obj:`True` for testing and debugging. To ensure accurate memory measurement it is recommended to run each memory benchmark in a separate process by making sure :obj:`no_multi_processing` is set to :obj:`True`.
- One should always state the environment information when sharing the results of a model benchmark. Results can vary heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very useful for the community.
+- Currently, only single device benchmarking is supported. When benchmarking on GPU, it is recommended that the user
+  specifies on which device the code should be run by setting the ``CUDA_VISIBLE_DEVICES`` environment variable in the
+  shell, `e.g.` ``export CUDA_VISIBLE_DEVICES=0`` before running the code.
+- The option :obj:`no_multi_processing` should only be set to :obj:`True` for testing and debugging. To ensure accurate
+  memory measurement it is recommended to run each memory benchmark in a separate process by making sure
+  :obj:`no_multi_processing` is set to :obj:`False`.
+- One should always state the environment information when sharing the results of a model benchmark. Results can vary
+  heavily between different GPU devices, library versions, etc., so that benchmark results on their own are not very
+  useful for the community.


 Sharing your benchmark
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Previously all available core models (10 at the time) have been benchmarked for `inference time`, across many different settings: using PyTorch, with
-and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for
-TensorFlow XLA) and GPUs.
+Previously all available core models (10 at the time) have been benchmarked for `inference time`, across many different
+settings: using PyTorch, with and without TorchScript, using TensorFlow, with and without XLA. All of those tests were
+done across CPUs (except for TensorFlow XLA) and GPUs.

-The approach is detailed in the `following blogpost <https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2>`__ and the results are available `here <https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing>`__.
+The approach is detailed in the `following blogpost
+<https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2>`__ and the results are
+available `here
+<https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing>`__.

-With the new `benchmark` tools, it is easier than ever to share your benchmark results with the community `here <https://github.com/huggingface/transformers/blob/master/examples/benchmarking/README.md>`__.
+With the new `benchmark` tools, it is easier than ever to share your benchmark results with the community
+:prefix_link:`here <examples/benchmarking/README.md>`.
--- a/docs/source/bertology.rst
+++ b/docs/source/bertology.rst
@@ -1,18 +1,38 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 BERTology
 -----------------------------------------------------------------------------------------------------------------------

-There is a growing field of study concerned with investigating the inner working of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
+There is a growing field of study concerned with investigating the inner workings of large-scale transformers like
+BERT (that some call "BERTology"). Some good examples of this field are:


-* BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950
+* BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick:
+  https://arxiv.org/abs/1905.05950
 * Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
-* What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341
+* What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D.
+  Manning: https://arxiv.org/abs/1906.04341

-In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
+In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to
+help people access the inner representations, mainly adapted from the great work of Paul Michel
+(https://arxiv.org/abs/1905.10650):


 * accessing all the hidden-states of BERT/GPT/GPT-2,
 * accessing all the attention weights for each head of BERT/GPT/GPT-2,
-* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained in https://arxiv.org/abs/1905.10650.
+* retrieving heads output values and gradients to be able to compute head importance score and prune head as explained
+  in https://arxiv.org/abs/1905.10650.

-To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/transformers/blob/master/examples/bertology/run_bertology.py>`_ while extract information and prune a model pre-trained on GLUE.
+To help you understand and use these features, we have added a specific example script: :prefix_link:`bertology.py
+<examples/research_projects/bertology/run_bertology.py>`, which extracts information and prunes a model pre-trained on
+GLUE.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -20,14 +20,17 @@ sys.path.insert(0, os.path.abspath('../../src'))
 # -- Project information -----------------------------------------------------

 project = u'transformers'
-copyright = u'2020, huggingface'
+copyright = u'2020, The Hugging Face Team, Licenced under the Apache License, Version 2.0'
 author = u'huggingface'

 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'3.3.0'
-
+release = u'4.2.0'
+# Prefix link to point to master, comment this during version release and uncomment below line
+extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/master/%s', '')}
+# Prefix link to always point to corresponding version, uncomment this during version release
+# extlinks = {'prefix_link': ('https://github.com/huggingface/transformers/blob/v'+ release + '/%s', '')}

 # -- General configuration ---------------------------------------------------

@@ -40,6 +43,7 @@ release = u'3.3.0'
 # ones.
 extensions = [
    'sphinx.ext.autodoc',
+    'sphinx.ext.extlinks',
    'sphinx.ext.coverage',
    'sphinx.ext.napoleon',
    'recommonmark',
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -1,24 +1,51 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Converting Tensorflow Checkpoints
 =======================================================================================================================

-A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints in models than be loaded using the ``from_pretrained`` methods of the library.
+A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints into
+models that can be loaded using the ``from_pretrained`` methods of the library.

 .. note::
-    Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**)
-    available in any transformers >= 2.3.0 installation.
+    Since 2.3.0 the conversion script is now part of the transformers CLI (**transformers-cli**) available in any
+    transformers >= 2.3.0 installation.

    The documentation below reflects the **transformers-cli convert** command format.

 BERT
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_bert_original_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py>`_ script.
+You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google
+<https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the
+:prefix_link:`convert_bert_original_tf_checkpoint_to_pytorch.py
+<src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py>` script.

-This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
+This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated
+configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights
+from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that
+can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py
+<https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ ,
+`run_bert_classifier.py
+<https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and
+`run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\
+).

-You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\ ``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too.
+You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow
+checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\
+``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too.

-To run this specific conversion script you will need to have TensorFlow and PyTorch installed (\ ``pip install tensorflow``\ ). The rest of the repository only requires PyTorch.
+To run this specific conversion script you will need to have TensorFlow and PyTorch installed (\ ``pip install
+tensorflow``\ ). The rest of the repository only requires PyTorch.

 Here is an example of the conversion process for a pre-trained ``BERT-Base Uncased`` model:

@@ -31,14 +58,19 @@ Here is an example of the conversion process for a pre-trained ``BERT-Base Uncas
     --config $BERT_BASE_DIR/bert_config.json \
     --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin

-You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/bert#pre-trained-models>`__.
+You can download Google's pre-trained models for the conversion `here
+<https://github.com/google-research/bert#pre-trained-models>`__.

 ALBERT
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Convert TensorFlow model checkpoints of ALBERT to PyTorch using the `convert_albert_original_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/transformers/blob/master/src/transformers/convert_bert_original_tf_checkpoint_to_pytorch.py>`_ script.
+Convert TensorFlow model checkpoints of ALBERT to PyTorch using the
+:prefix_link:`convert_albert_original_tf_checkpoint_to_pytorch.py
+<src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py>` script.

-The CLI takes as input a TensorFlow checkpoint (three files starting with ``model.ckpt-best``\ ) and the accompanying configuration file (\ ``albert_config.json``\ ), then creates and saves a PyTorch model. To run this conversion you will need to have TensorFlow and PyTorch installed.
+The CLI takes as input a TensorFlow checkpoint (three files starting with ``model.ckpt-best``\ ) and the accompanying
+configuration file (\ ``albert_config.json``\ ), then creates and saves a PyTorch model. To run this conversion you
+will need to have TensorFlow and PyTorch installed.

 Here is an example of the conversion process for the pre-trained ``ALBERT Base`` model:

@@ -51,12 +83,15 @@ Here is an example of the conversion process for the pre-trained ``ALBERT Base``
     --config $ALBERT_BASE_DIR/albert_config.json \
     --pytorch_dump_output $ALBERT_BASE_DIR/pytorch_model.bin

-You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/albert#pre-trained-models>`__.
+You can download Google's pre-trained models for the conversion `here
+<https://github.com/google-research/albert#pre-trained-models>`__.

 OpenAI GPT
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint save as the same format than OpenAI pretrained model (see `here <https://github.com/openai/finetune-transformer-lm>`__\ )
+Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint
+is saved in the same format as the OpenAI pretrained model (see `here
+<https://github.com/openai/finetune-transformer-lm>`__\ )

 .. code-block:: shell

@@ -72,7 +107,8 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
 OpenAI GPT-2
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here <https://github.com/openai/gpt-2>`__\ )
+Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here
+<https://github.com/openai/gpt-2>`__\ )

 .. code-block:: shell

@@ -87,7 +123,8 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT-2 mode
 Transformer-XL
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here <https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models>`__\ )
+Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here
+<https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models>`__\ )

 .. code-block:: shell

@@ -130,4 +167,4 @@ Here is an example of the conversion process for a pre-trained XLM model:
     --tf_checkpoint $XLM_CHECKPOINT_PATH \
     --pytorch_dump_output $PYTORCH_DUMP_OUTPUT
    [--config XML_CONFIG] \
-    [--finetuning_task_name XML_FINETUNED_TASK]
+    [--finetuning_task_name XML_FINETUNED_TASK]
--- a/docs/source/custom_datasets.rst
+++ b/docs/source/custom_datasets.rst
@@ -1,17 +1,29 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Fine-tuning with custom datasets
 =======================================================================================================================

 .. note::

-    The datasets used in this tutorial are available and can be more easily accessed using the
-    `🤗 NLP library <https://github.com/huggingface/nlp>`_. We do not use this library to access the datasets here
-    since this tutorial meant to illustrate how to work with your own data. A brief of introduction can be found
-    at the end of the tutorial in the section ":ref:`nlplib`".
+    The datasets used in this tutorial are available and can be more easily accessed using the `🤗 NLP library
+    <https://github.com/huggingface/nlp>`_. We do not use this library to access the datasets here since this tutorial
+    is meant to illustrate how to work with your own data. A brief introduction can be found at the end of the tutorial
+    in the section ":ref:`nlplib`".

-This tutorial will take you through several examples of using 🤗 Transformers models with your own datasets. The
-guide shows one of many valid workflows for using these models and is meant to be illustrative rather than
-definitive. We show examples of reading in several data formats, preprocessing the data for several types of tasks,
-and then preparing the data into PyTorch/TensorFlow ``Dataset`` objects which can easily be used either with
+This tutorial will take you through several examples of using 🤗 Transformers models with your own datasets. The guide
+shows one of many valid workflows for using these models and is meant to be illustrative rather than definitive. We
+show examples of reading in several data formats, preprocessing the data for several types of tasks, and then preparing
+the data into PyTorch/TensorFlow ``Dataset`` objects which can easily be used either with
 :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow.

 We include several examples, each of which demonstrates a different type of common downstream task:
@@ -28,13 +40,13 @@ Sequence Classification with IMDb Reviews

 .. note::

-    This dataset can be explored in the Hugging Face model hub (`IMDb <https://huggingface.co/datasets/imdb>`_), and can
-    be alternatively downloaded with the 🤗 NLP library with ``load_dataset("imdb")``.
+    This dataset can be explored in the Hugging Face model hub (`IMDb <https://huggingface.co/datasets/imdb>`_), and
+    can be alternatively downloaded with the 🤗 NLP library with ``load_dataset("imdb")``.

-In this example, we'll show how to download, tokenize, and train a model on the IMDb reviews dataset. This task
-takes the text of a review and requires the model to predict whether the sentiment of the review is positive or
-negative. Let's start by downloading the dataset from the
-`Large Movie Review Dataset <http://ai.stanford.edu/~amaas/data/sentiment/>`_ webpage.
+In this example, we'll show how to download, tokenize, and train a model on the IMDb reviews dataset. This task takes
+the text of a review and requires the model to predict whether the sentiment of the review is positive or negative.
+Let's start by downloading the dataset from the `Large Movie Review Dataset
+<http://ai.stanford.edu/~amaas/data/sentiment/>`_ webpage.

 .. code-block:: bash

@@ -62,9 +74,8 @@ read this in.
    train_texts, train_labels = read_imdb_split('aclImdb/train')
    test_texts, test_labels = read_imdb_split('aclImdb/test')

-We now have a train and test dataset, but let's also also create a validation set which we can use for for
-evaluation and tuning without training our test set results. Sklearn has a convenient utility for creating such
-splits:
+We now have a train and test dataset, but let's also create a validation set which we can use for evaluation and
+tuning without tainting our test set results. Sklearn has a convenient utility for creating such splits:

 .. code-block:: python

@@ -80,8 +91,8 @@ pre-trained DistilBert, so let's use the DistilBert tokenizer.
    tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

 Now we can simply pass our texts to the tokenizer. We'll pass ``truncation=True`` and ``padding=True``, which will
-ensure that all of our sequences are padded to the same length and are truncated to be no longer model's maximum
-input length. This will allow us to feed batches of sequences into the model at the same time.
+ensure that all of our sequences are padded to the same length and are truncated to be no longer than the model's
+maximum input length. This will allow us to feed batches of sequences into the model at the same time.

 .. code-block:: python

@@ -90,9 +101,9 @@ input length. This will allow us to feed batches of sequences into the model at
    test_encodings = tokenizer(test_texts, truncation=True, padding=True)

 Now, let's turn our labels and encodings into a Dataset object. In PyTorch, this is done by subclassing a
-``torch.utils.data.Dataset`` object and implementing ``__len__`` and ``__getitem__``. In TensorFlow, we pass our input encodings and
-labels to the ``from_tensor_slices`` constructor method. We put the data in this format so that the data can be
-easily batched such that each key in the batch encoding corresponds to a named parameter of the
+``torch.utils.data.Dataset`` object and implementing ``__len__`` and ``__getitem__``. In TensorFlow, we pass our input
+encodings and labels to the ``from_tensor_slices`` constructor method. We put the data in this format so that the data
+can be easily batched such that each key in the batch encoding corresponds to a named parameter of the
 :meth:`~transformers.DistilBertForSequenceClassification.forward` method of the model we will train.

 .. code-block:: python
@@ -133,17 +144,17 @@ easily batched such that each key in the batch encoding corresponds to a named p
    ))

 Now that our datasets are ready, we can fine-tune a model either with the 🤗
-:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow. See
-:doc:`training <training>`.
+:class:`~transformers.Trainer`/:class:`~transformers.TFTrainer` or with native PyTorch/TensorFlow. See :doc:`training
+<training>`.

 .. _ft_trainer:

 Fine-tuning with Trainer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The steps above prepared the datasets in the way that the trainer is expected. Now all we need to do is create a
-model to fine-tune, define the :class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments`
-and instantiate a :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`.
+The steps above prepared the datasets in the way that the trainer expects. Now all we need to do is create a model
+to fine-tune, define the :class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments` and
+instantiate a :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`.

 .. code-block:: python

@@ -248,15 +259,15 @@ Token Classification with W-NUT Emerging Entities

 .. note::

-    This dataset can be explored in the Hugging Face model hub (`WNUT-17 <https://huggingface.co/datasets/wnut_17>`_), and can
-    be alternatively downloaded with the 🤗 NLP library with ``load_dataset("wnut_17")``.
+    This dataset can be explored in the Hugging Face model hub (`WNUT-17 <https://huggingface.co/datasets/wnut_17>`_),
+    and can be alternatively downloaded with the 🤗 NLP library with ``load_dataset("wnut_17")``.

 Next we will look at token classification. Rather than classifying an entire sequence, this task classifies token by
-token. We'll demonstrate how to do this with 
-`Named Entity Recognition <http://nlpprogress.com/english/named_entity_recognition.html>`_, which involves
-identifying tokens which correspond to a predefined set of "entities". Specifically, we'll use the
-`W-NUT Emerging and Rare entities <http://noisy-text.github.io/2017/emerging-rare-entities.html>`_ corpus. The data
-is given as a collection of pre-tokenized documents where each token is assigned a tag.
+token. We'll demonstrate how to do this with `Named Entity Recognition
+<http://nlpprogress.com/english/named_entity_recognition.html>`_, which involves identifying tokens which correspond to
+a predefined set of "entities". Specifically, we'll use the `W-NUT Emerging and Rare entities
+<http://noisy-text.github.io/2017/emerging-rare-entities.html>`_ corpus. The data is given as a collection of
+pre-tokenized documents where each token is assigned a tag.

 Let's start by downloading the data.

@@ -264,10 +275,10 @@ Let's start by downloading the data.

    wget http://noisy-text.github.io/2017/files/wnut17train.conll

-In this case, we'll just download the train set, which is a single text file. Each line of the file contains either
-(1) a word and tag separated by a tab, or (2) a blank line indicating the end of a document. Let's write a
-function to read this in. We'll take in the file path and return ``token_docs`` which is a list of lists of token
-strings, and ``token_tags`` which is a list of lists of tag strings.
+In this case, we'll just download the train set, which is a single text file. Each line of the file contains either (1)
+a word and tag separated by a tab, or (2) a blank line indicating the end of a document. Let's write a function to read
+this in. We'll take in the file path and return ``token_docs`` which is a list of lists of token strings, and
+``token_tags`` which is a list of lists of tag strings.

 .. code-block:: python

@@ -290,11 +301,11 @@ strings, and ``token_tags`` which is a list of lists of tag strings.
                tags.append(tag)
            token_docs.append(tokens)
            tag_docs.append(tags)
-        
+
        return token_docs, tag_docs
-    
+
    texts, tags = read_wnut('wnut17train.conll')
-    
+
 Just to see what this data looks like, let's take a look at a segment of the first document.

 .. code-block:: python
@@ -303,8 +314,8 @@ Just to see what this data looks like, let's take a look at a segment of the fir
    ['for', 'two', 'weeks', '.', 'Empire', 'State', 'Building']
    ['O', 'O', 'O', 'O', 'B-location', 'I-location', 'I-location']

-``location`` is an entity type, ``B-`` indicates the beginning of an entity, and ``I-`` indicates consecutive positions of
-the same entity ("Empire State Building" is considered one entity). ``O`` indicates the token does not correspond to
+``location`` is an entity type, ``B-`` indicates the beginning of an entity, and ``I-`` indicates consecutive positions
+of the same entity ("Empire State Building" is considered one entity). ``O`` indicates the token does not correspond to
 any entity.

 Now that we've read the data in, let's create a train/validation split:
@@ -314,8 +325,8 @@ Now that we've read the data in, let's create a train/validation split:
    from sklearn.model_selection import train_test_split
    train_texts, val_texts, train_tags, val_tags = train_test_split(texts, tags, test_size=.2)

-Next, let's create encodings for our tokens and tags. For the tags, we can start by just create a simple mapping
-which we'll use in a moment:
+Next, let's create encodings for our tokens and tags. For the tags, we can start by just creating a simple mapping
+which we'll use in a moment:

 .. code-block:: python

@@ -323,11 +334,11 @@ which we'll use in a moment:
    tag2id = {tag: id for id, tag in enumerate(unique_tags)}
    id2tag = {id: tag for tag, id in tag2id.items()}

-To encode the tokens, we'll use a pre-trained DistilBert tokenizer. We can tell the tokenizer that we're dealing
-with ready-split tokens rather than full sentence strings by passing ``is_split_into_words=True``. We'll also pass
-``padding=True`` and ``truncation=True`` to pad the sequences to be the same length. Lastly, we can tell the model
-to return information about the tokens which are split by the wordpiece tokenization process, which we will need in
-a moment.
+To encode the tokens, we'll use a pre-trained DistilBert tokenizer. We can tell the tokenizer that we're dealing with
+ready-split tokens rather than full sentence strings by passing ``is_split_into_words=True``. We'll also pass
+``padding=True`` and ``truncation=True`` to pad the sequences to be the same length. Lastly, we can tell the model to
+return information about the tokens which are split by the wordpiece tokenization process, which we will need in a
+moment.

 .. code-block:: python

@@ -339,26 +350,26 @@ a moment.
 Great, so now our tokens are nicely encoded in the format that they need to be in to feed them into our DistilBert
 model below.

-Now we arrive at a common obstacle with using pre-trained models for token-level classification: many of the tokens
-in the W-NUT corpus are not in DistilBert's vocabulary. Bert and many models like it use a method called WordPiece
-Tokenization, meaning that single words are split into multiple tokens such that each token is likely to be in
-the vocabulary. For example, DistilBert's tokenizer would split the Twitter handle ``@huggingface`` into the tokens
-``['@', 'hugging', '##face']``. This is a problem for us because we have exactly one tag per token. If the tokenizer
-splits a token into multiple sub-tokens, then we will end up with a mismatch between our tokens and our labels.
+Now we arrive at a common obstacle with using pre-trained models for token-level classification: many of the tokens in
+the W-NUT corpus are not in DistilBert's vocabulary. Bert and many models like it use a method called WordPiece
+Tokenization, meaning that single words are split into multiple tokens such that each token is likely to be in the
+vocabulary. For example, DistilBert's tokenizer would split the Twitter handle ``@huggingface`` into the tokens ``['@',
+'hugging', '##face']``. This is a problem for us because we have exactly one tag per token. If the tokenizer splits a
+token into multiple sub-tokens, then we will end up with a mismatch between our tokens and our labels.

-One way to handle this is to only train on the tag labels for the first subtoken of a split token. We can do this in
-🤗 Transformers by setting the labels we wish to ignore to ``-100``. In the example above, if the label for
+One way to handle this is to only train on the tag labels for the first subtoken of a split token. We can do this in 🤗
+Transformers by setting the labels we wish to ignore to ``-100``. In the example above, if the label for
 ``@HuggingFace`` is ``3`` (indexing ``B-corporation``), we would set the labels of ``['@', 'hugging', '##face']`` to
 ``[3, -100, -100]``.

 Let's write a function to do this. This is where we will use the ``offset_mapping`` from the tokenizer as mentioned
 above. For each sub-token returned by the tokenizer, the offset mapping gives us a tuple indicating the sub-token's
-start position and end position relative to the original token it was split from. That means that if the first
-position in the tuple is anything other than ``0``, we will set its corresponding label to ``-100``. While we're at
-it, we can also set labels to ``-100`` if the second position of the offset mapping is ``0``, since this means it must
-be a special token like ``[PAD]`` or ``[CLS]``.
+start position and end position relative to the original token it was split from. That means that if the first position
+in the tuple is anything other than ``0``, we will set its corresponding label to ``-100``. While we're at it, we can
+also set labels to ``-100`` if the second position of the offset mapping is ``0``, since this means it must be a
+special token like ``[PAD]`` or ``[CLS]``.

-.. note:: 
+.. note::

    Due to a recently fixed bug, -1 must be used instead of -100 when using TensorFlow in 🤗 Transformers <= 3.02.

@@ -379,7 +390,7 @@ be a special token like ``[PAD]`` or ``[CLS]``.
            encoded_labels.append(doc_enc_labels.tolist())

        return encoded_labels
-    
+
    train_labels = encode_tags(train_tags, train_encodings)
    val_labels = encode_tags(val_tags, val_encodings)

@@ -447,8 +458,9 @@ Question Answering with SQuAD 2.0

 .. note::

-    This dataset can be explored in the Hugging Face model hub (`SQuAD V2 <https://huggingface.co/datasets/squad_v2>`_), and can
-    be alternatively downloaded with the 🤗 NLP library with ``load_dataset("squad_v2")``.
+    This dataset can be explored in the Hugging Face model hub (`SQuAD V2
+    <https://huggingface.co/datasets/squad_v2>`_), and can be alternatively downloaded with the 🤗 NLP library with
+    ``load_dataset("squad_v2")``.

 Question answering comes in many forms. In this example, we'll look at the particular type of extractive QA that
 involves answering a question about a passage by highlighting the segment of the passage that answers the question.
@@ -464,8 +476,8 @@ We will start by downloading the data:
    wget https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json -O squad/dev-v2.0.json

 Each split is in a structured json file with a number of questions and answers for each passage (or context). We'll
-take this apart into parallel lists of contexts, questions, and answers (note that the contexts here are repeated
-since there are multiple questions per context):
+take this apart into parallel lists of contexts, questions, and answers (note that the contexts here are repeated since
+there are multiple questions per context):

 .. code-block:: python

@@ -491,17 +503,17 @@ since there are multiple questions per context):
                        answers.append(answer)

        return contexts, questions, answers
-    
+
    train_contexts, train_questions, train_answers = read_squad('squad/train-v2.0.json')
    val_contexts, val_questions, val_answers = read_squad('squad/dev-v2.0.json')

-The contexts and questions are just strings. The answers are dicts containing the subsequence of the passage with
-the correct answer as well as an integer indicating the character at which the answer begins. In order to train a
-model on this data we need (1) the tokenized context/question pairs, and (2) integers indicating at which *token*
-positions the answer begins and ends.
+The contexts and questions are just strings. The answers are dicts containing the subsequence of the passage with the
+correct answer as well as an integer indicating the character at which the answer begins. In order to train a model on
+this data we need (1) the tokenized context/question pairs, and (2) integers indicating at which *token* positions the
+answer begins and ends.

-First, let's get the *character* position at which the answer ends in the passage (we are given the starting
-position). Sometimes SQuAD answers are off by one or two characters, so we will also adjust for that.
+First, let's get the *character* position at which the answer ends in the passage (we are given the starting position).
+Sometimes SQuAD answers are off by one or two characters, so we will also adjust for that.

 .. code-block:: python

@@ -510,7 +522,7 @@ position). Sometimes SQuAD answers are off by one or two characters, so we will
            gold_text = answer['text']
            start_idx = answer['answer_start']
            end_idx = start_idx + len(gold_text)
-            
+
            # sometimes squad answers are off by a character or two – fix this
            if context[start_idx:end_idx] == gold_text:
                answer['answer_end'] = end_idx
@@ -524,9 +536,9 @@ position). Sometimes SQuAD answers are off by one or two characters, so we will
    add_end_idx(train_answers, train_contexts)
    add_end_idx(val_answers, val_contexts)

-Now ``train_answers`` and ``val_answers`` include the character end positions and the corrected start positions.
-Next, let's tokenize our context/question pairs. 🤗 Tokenizers can accept parallel lists of sequences and encode
-them together as sequence pairs.
+Now ``train_answers`` and ``val_answers`` include the character end positions and the corrected start positions. Next,
+let's tokenize our context/question pairs. 🤗 Tokenizers can accept parallel lists of sequences and encode them together
+as sequence pairs.

 .. code-block:: python

@@ -536,8 +548,8 @@ them together as sequence pairs.
    train_encodings = tokenizer(train_contexts, train_questions, truncation=True, padding=True)
    val_encodings = tokenizer(val_contexts, val_questions, truncation=True, padding=True)

-Next we need to convert our character start/end positions to token start/end positions. When using 🤗 Fast
-Tokenizers, we can use the built in :func:`~transformers.BatchEncoding.char_to_token` method.
+Next we need to convert our character start/end positions to token start/end positions. When using 🤗 Fast Tokenizers,
+we can use the built in :func:`~transformers.BatchEncoding.char_to_token` method.

 .. code-block:: python

@@ -546,20 +558,23 @@ Tokenizers, we can use the built in :func:`~transformers.BatchEncoding.char_to_t
        end_positions = []
        for i in range(len(answers)):
            start_positions.append(encodings.char_to_token(i, answers[i]['answer_start']))
-            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end'] - 1))
-            # if None, the answer passage has been truncated
+            end_positions.append(encodings.char_to_token(i, answers[i]['answer_end']))
+
+            # if start position is None, the answer passage has been truncated
            if start_positions[-1] is None:
                start_positions[-1] = tokenizer.model_max_length
+
+            # if end position is None, the 'char_to_token' function points to the space before the correct token - > add + 1
            if end_positions[-1] is None:
-                end_positions[-1] = tokenizer.model_max_length
+                end_positions[-1] = encodings.char_to_token(i, answers[i]['answer_end'] + 1)
        encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

    add_token_positions(train_encodings, train_answers)
    add_token_positions(val_encodings, val_answers)

-Our data is ready. Let's just put it in a PyTorch/TensorFlow dataset so that we can easily use it for
-training. In PyTorch, we define a custom ``Dataset`` class. In TensorFlow, we pass a tuple of
-``(inputs_dict, labels_dict)`` to the ``from_tensor_slices`` method.
+Our data is ready. Let's just put it in a PyTorch/TensorFlow dataset so that we can easily use it for training. In
+PyTorch, we define a custom ``Dataset`` class. In TensorFlow, we pass a tuple of ``(inputs_dict, labels_dict)`` to the
+``from_tensor_slices`` method.

 .. code-block:: python

@@ -575,7 +590,7 @@ training. In PyTorch, we define a custom ``Dataset`` class. In TensorFlow, we pa

        def __len__(self):
            return len(self.encodings.input_ids)
-        
+
    train_dataset = SquadDataset(train_encodings)
    val_dataset = SquadDataset(val_encodings)
    ## TENSORFLOW CODE
@@ -668,12 +683,11 @@ Additional Resources
 Using the 🤗 NLP Datasets & Metrics library
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-This tutorial demonstrates how to read in datasets from various raw text formats and prepare them for training with
-🤗 Transformers so that you can do the same thing with your own custom datasets. However, we recommend users use the
-`🤗 NLP library <https://github.com/huggingface/nlp>`_ for working with the 150+ datasets included in the
-`hub <https://huggingface.co/datasets>`_, including the three datasets used in this tutorial. As a very brief overview,
-we will show how to use the NLP library to download and prepare the IMDb dataset from the first example,
-:ref:`seq_imdb`.
+This tutorial demonstrates how to read in datasets from various raw text formats and prepare them for training with 🤗
+Transformers so that you can do the same thing with your own custom datasets. However, we recommend users use the `🤗
+NLP library <https://github.com/huggingface/nlp>`_ for working with the 150+ datasets included in the `hub
+<https://huggingface.co/datasets>`_, including the three datasets used in this tutorial. As a very brief overview, we
+will show how to use the NLP library to download and prepare the IMDb dataset from the first example, :ref:`seq_imdb`.

 Start by downloading the dataset:

@@ -689,8 +703,8 @@ Each dataset has multiple columns corresponding to different features. Let's see
    >>> print(train.column_names)
    ['label', 'text']

-Great. Now let's tokenize the text. We can do this using the ``map`` method. We'll also rename the ``label`` column
-to ``labels`` to match the model's input arguments.
+Great. Now let's tokenize the text. We can do this using the ``map`` method. We'll also rename the ``label`` column to
+``labels`` to match the model's input arguments.

 .. code-block:: python

@@ -711,5 +725,5 @@ dataset elements.
    >>> {key: val.shape for key, val in train[0].items()})
    {'labels': TensorShape([]), 'input_ids': TensorShape([512]), 'attention_mask': TensorShape([512])}

-We now have a fully-prepared dataset. Check out `the 🤗 NLP docs <https://huggingface.co/nlp/processing.html>`_ for
-a more thorough introduction.
+We now have a fully-prepared dataset. Check out `the 🤗 NLP docs <https://huggingface.co/nlp/processing.html>`_ for a
+more thorough introduction.
--- a/docs/source/glossary.rst
+++ b/docs/source/glossary.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Glossary
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

@@ -57,8 +69,8 @@ The tokenizer takes care of splitting the sequence into tokens available in the
    >>> tokenized_sequence = tokenizer.tokenize(sequence)

 The tokens are either words or subwords. Here for instance, "VRAM" wasn't in the model vocabulary, so it's been split
-in "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash prefix is
-added for "RA" and "M":
+in "V", "RA" and "M". To indicate those tokens are not separate words but parts of the same word, a double-hash prefix
+is added for "RA" and "M":

 .. code-block::

@@ -66,8 +78,8 @@ added for "RA" and "M":
    ['A', 'Titan', 'R', '##T', '##X', 'has', '24', '##GB', 'of', 'V', '##RA', '##M']

 These tokens can then be converted into IDs which are understandable by the model. This can be done by directly feeding
-the sentence to the tokenizer, which leverages the Rust implementation of
-`huggingface/tokenizers <https://github.com/huggingface/tokenizers>`__ for peak performance.
+the sentence to the tokenizer, which leverages the Rust implementation of `huggingface/tokenizers
+<https://github.com/huggingface/tokenizers>`__ for peak performance.

 .. code-block::

@@ -105,8 +117,8 @@ because this is the way a :class:`~transformers.BertModel` is going to expect it
 Attention mask
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The attention mask is an optional argument used when batching sequences together. This argument indicates to the
-model which tokens should be attended to, and which should not.
+The attention mask is an optional argument used when batching sequences together. This argument indicates to the model
+which tokens should be attended to, and which should not.

 For example, consider these two sequences:

@@ -145,10 +157,10 @@ We can see that 0s have been added on the right of the first sentence to make it
    >>> padded_sequences["input_ids"]
    [[101, 1188, 1110, 170, 1603, 4954, 119, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1188, 1110, 170, 1897, 1263, 4954, 119, 1135, 1110, 1120, 1655, 2039, 1190, 1103, 4954, 138, 119, 102]]

-This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating
-the position of the padded indices so that the model does not attend to them. For the
-:class:`~transformers.BertTokenizer`, :obj:`1` indicates a value that should be attended to, while :obj:`0` indicates
-a padded value. This attention mask is in the dictionary returned by the tokenizer under the key "attention_mask":
+This can then be converted into a tensor in PyTorch or TensorFlow. The attention mask is a binary tensor indicating the
+position of the padded indices so that the model does not attend to them. For the :class:`~transformers.BertTokenizer`,
+:obj:`1` indicates a value that should be attended to, while :obj:`0` indicates a padded value. This attention mask is
+in the dictionary returned by the tokenizer under the key "attention_mask":

 .. code-block::

@@ -161,15 +173,16 @@ Token Type IDs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Some models' purpose is to do sequence classification or question answering. These require two different sequences to
-be joined in a single "input_ids" entry, which usually is performed with the help of special tokens, such as the classifier (``[CLS]``) and separator (``[SEP]``)
-tokens. For example, the BERT model builds its two sequence input as such:
+be joined in a single "input_ids" entry, which usually is performed with the help of special tokens, such as the
+classifier (``[CLS]``) and separator (``[SEP]``) tokens. For example, the BERT model builds its two sequence input as
+such:

 .. code-block::

   >>> # [CLS] SEQUENCE_A [SEP] SEQUENCE_B [SEP]

-We can use our tokenizer to automatically generate such a sentence by passing the two sequences to ``tokenizer`` as two arguments (and
-not a list, like before) like this:
+We can use our tokenizer to automatically generate such a sentence by passing the two sequences to ``tokenizer`` as two
+arguments (and not a list, like before) like this:

 .. code-block::

@@ -189,8 +202,8 @@ which will return:
    [CLS] HuggingFace is based in NYC [SEP] Where is HuggingFace based? [SEP]

 This is enough for some models to understand where one sequence ends and where another begins. However, other models,
-such as BERT, also deploy token type IDs (also called segment IDs). They are represented as a binary
-mask identifying the two types of sequence in the model.
+such as BERT, also deploy token type IDs (also called segment IDs). They are represented as a binary mask identifying
+the two types of sequence in the model.

 The tokenizer returns this mask as the "token_type_ids" entry:

@@ -209,14 +222,59 @@ Some models, like :class:`~transformers.XLNetModel` use an additional token repr
 Position IDs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Contrary to RNNs that have the position of each token embedded within them,
-transformers are unaware of the position of each token. Therefore, the position IDs (``position_ids``) are used by the model to identify each token's position in the list of tokens.
+Contrary to RNNs that have the position of each token embedded within them, transformers are unaware of the position of
+each token. Therefore, the position IDs (``position_ids``) are used by the model to identify each token's position in
+the list of tokens.

-They are an optional parameter. If no ``position_ids`` is passed to the model, the IDs are automatically created as absolute
-positional embeddings.
+They are an optional parameter. If no ``position_ids`` are passed to the model, the IDs are automatically created as
+absolute positional embeddings.

-Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models
-use other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
+Absolute positional embeddings are selected in the range ``[0, config.max_position_embeddings - 1]``. Some models use
+other types of positional embeddings, such as sinusoidal position embeddings or relative position embeddings.
+
+.. _labels:
+
+Labels
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The labels are an optional argument which can be passed in order for the model to compute the loss itself. These labels
+should be the expected prediction of the model: it will use the standard loss in order to compute the loss between its
+predictions and the expected value (the label).
+
+These labels are different according to the model head, for example:
+
+- For sequence classification models (e.g., :class:`~transformers.BertForSequenceClassification`), the model expects a
+  tensor of dimension :obj:`(batch_size)` with each value of the batch corresponding to the expected label of the
+  entire sequence.
+- For token classification models (e.g., :class:`~transformers.BertForTokenClassification`), the model expects a tensor
+  of dimension :obj:`(batch_size, seq_length)` with each value corresponding to the expected label of each individual
+  token.
+- For masked language modeling (e.g., :class:`~transformers.BertForMaskedLM`), the model expects a tensor of dimension
+  :obj:`(batch_size, seq_length)` with each value corresponding to the expected label of each individual token: the
+  labels being the token ID for the masked token, and values to be ignored for the rest (usually -100).
+- For sequence to sequence tasks (e.g., :class:`~transformers.BartForConditionalGeneration`,
+  :class:`~transformers.MBartForConditionalGeneration`), the model expects a tensor of dimension :obj:`(batch_size,
+  tgt_seq_length)` with each value corresponding to the target sequences associated with each input sequence. During
+  training, both `BART` and `T5` will make the appropriate `decoder_input_ids` and decoder attention masks internally.
+  They usually do not need to be supplied. This does not apply to models leveraging the Encoder-Decoder framework. See
+  the documentation of each model for more information on each specific model's labels.
+
+The base models (e.g., :class:`~transformers.BertModel`) do not accept labels, as these are the base transformer
+models, simply outputting features.
+
+.. _decoder-input-ids:
+
+Decoder input IDs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This input is specific to encoder-decoder models, and contains the input IDs that will be fed to the decoder. These
+inputs should be used for sequence to sequence tasks, such as translation or summarization, and are usually built in a
+way specific to each model.
+
+Most encoder-decoder models (BART, T5) create their :obj:`decoder_input_ids` on their own from the :obj:`labels`. In
+such models, passing the :obj:`labels` is the preferred way to handle training.
+
+Please check each model's docs to see how they handle these input IDs for sequence to sequence training.

 .. _feed-forward-chunking:

@@ -224,18 +282,18 @@ Feed Forward Chunking
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 In each residual attention block in transformers the self-attention layer is usually followed by 2 feed forward layers.
-The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (e.g.,
-for ``bert-base-uncased``).
+The intermediate embedding size of the feed forward layers is often bigger than the hidden size of the model (e.g., for
+``bert-base-uncased``).

 For an input of size ``[batch_size, sequence_length]``, the memory required to store the intermediate feed forward
 embeddings ``[batch_size, sequence_length, config.intermediate_size]`` can account for a large fraction of the memory
 use. The authors of `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ noticed that since the
 computation is independent of the ``sequence_length`` dimension, it is mathematically equivalent to compute the output
 embeddings of both feed forward layers ``[batch_size, config.hidden_size]_0, ..., [batch_size, config.hidden_size]_n``
-individually and concat them afterward to ``[batch_size, sequence_length, config.hidden_size]`` with
-``n = sequence_length``, which trades increased computation time against reduced memory use, but yields a
-mathematically **equivalent** result.
+individually and concat them afterward to ``[batch_size, sequence_length, config.hidden_size]`` with ``n =
+sequence_length``, which trades increased computation time against reduced memory use, but yields a mathematically
+**equivalent** result.

 For models employing the function :func:`~.transformers.apply_chunking_to_forward`, the ``chunk_size`` defines the
 number of output embeddings that are computed in parallel and thus defines the trade-off between memory and time
-complexity.  If ``chunk_size`` is set to 0, no feed forward chunking is done.
+complexity. If ``chunk_size`` is set to 0, no feed forward chunking is done.
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -22,6 +22,18 @@ State-of-the-art NLP for everyone:
 - Hands-on practitioners
 - AI/ML/NLP teachers and educators

+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Lower compute costs, smaller carbon footprint:

 - Researchers can share trained models instead of always retraining
@@ -35,6 +47,16 @@ Choose the right framework for every part of a model's lifetime:
 - Move a single model between TF2.0/PyTorch frameworks at will
 - Seamlessly pick the right framework for training, evaluation, production

+Experimental support for Flax is available for a few models right now and is expected to grow in the coming months.
+
+`All the model checkpoints <https://huggingface.co/models>`__ are seamlessly integrated from the huggingface.co `model
+hub <https://huggingface.co>`__ where they are uploaded directly by `users <https://huggingface.co/users>`__ and
+`organizations <https://huggingface.co/organizations>`__.
+
+Current number of checkpoints: |checkpoints|
+
+.. |checkpoints| image:: https://img.shields.io/endpoint?url=https://huggingface.co/api/shields/models&color=brightgreen
+
 Contents
 -----------------------------------------------------------------------------------------------------------------------

@@ -44,107 +66,237 @@ The documentation is organized in five parts:
  and a glossary.
 - **USING 🤗 TRANSFORMERS** contains general tutorials on how to use the library.
 - **ADVANCED GUIDES** contains more advanced guides that are more specific to a given script or part of the library.
- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general resarch in
+- **RESEARCH** focuses on tutorials that have less to do with how to use the library but more about general research in
  transformers model
 - The last three sections contain the documentation of each public class and function, grouped in:
+
    - **MAIN CLASSES** for the main classes exposing the important APIs of the library.
    - **MODELS** for the classes and functions related to each model implemented in the library.
    - **INTERNAL HELPERS** for the classes and functions we use internally.

-The library currently contains PyTorch and Tensorflow implementations, pre-trained model weights, usage scripts and
-conversion utilities for the following models:
+The library currently contains PyTorch, TensorFlow and Flax implementations, pretrained model weights, usage scripts
+and conversion utilities for the following models:

-1. `BERT <https://github.com/google-research/bert>`_ (from Google) released with the paper `BERT: Pre-training of Deep
-   Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ by Jacob Devlin, Ming-Wei
-   Chang, Kenton Lee, and Kristina Toutanova.
-2. `GPT <https://github.com/openai/finetune-transformer-lm>`_ (from OpenAI) released with the paper `Improving Language
-   Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised>`_ by Alec Radford, Karthik
-   Narasimhan, Tim Salimans, and Ilya Sutskever.
-3. `GPT-2 <https://blog.openai.com/better-language-models>`_ (from OpenAI) released with the paper `Language Models are
-   Unsupervised Multitask Learners <https://blog.openai.com/better-language-models>`_ by Alec Radford, Jeffrey Wu,
-   Rewon Child, David Luan, Dario Amodei, and Ilya Sutskever.
-4. `Transformer-XL <https://github.com/kimiyoung/transformer-xl>`_ (from Google/CMU) released with the paper
-   `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_ by
-   Zihang Dai, Zhilin Yang, Yiming Yang, Jaime Carbonell, Quoc V. Le, and Ruslan Salakhutdinov.
-5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `XLNet: Generalized
-   Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang, Zihang
-   Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, and Quoc V. Le.
-6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual
-   Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
-7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with
-   the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle
-   Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, and Veselin
-   Stoyanov.
-8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together
-   with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter
-   <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut, and Thomas Wolf. The same method has been
-   applied to compress GPT2 into
-   `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
-9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the
-   paper `CTRL: A Conditional Transformer Language Model for Controllable Generation
-   <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar, Bryan McCann, Lav R. Varshney, Caiming Xiong,
-   and Richard Socher.
-10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université)
-    released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by
-    Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la
-    Clergerie, Djame Seddah, and Benoît Sagot.
-11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper
-    `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_
-    by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, and Radu Soricut.
-12. `T5 <https://github.com/google-research/text-to-text-transfer-transformer>`_ (from Google) released with the paper
-    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
-    <https://arxiv.org/abs/1910.10683>`_ by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang,
-    Michael Matena, Yanqi Zhou, Wei Li, and Peter J. Liu.
-13. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together
-    with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by
-    Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard
-    Grave, Myle Ott, Luke Zettlemoyer, and Veselin Stoyanov.
-14. `MMBT <https://github.com/facebookresearch/mmbt/>`_ (from Facebook), released together with the paper a `Supervised
-    Multimodal Bitransformers for Classifying Images and Text <https://arxiv.org/pdf/1909.02950.pdf>`_ by Douwe Kiela,
-    Suvrat Bhooshan, Hamed Firooz, and Davide Testuggine.
-15. `FlauBERT <https://github.com/getalp/Flaubert>`_ (from CNRS) released with the paper `FlauBERT: Unsupervised
-    Language Model Pre-training for French <https://arxiv.org/abs/1912.05372>`_ by Hang Le, Loïc Vial, Jibril Frej,
-    Vincent Segonne, Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, and
-    Didier Schwab.
-16. `BART <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_ (from Facebook) released with the paper
-    `BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation, Translation, and Comprehension
-    <https://arxiv.org/pdf/1910.13461.pdf>`_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman
-    Mohamed, Omer Levy, Ves Stoyanov, and Luke Zettlemoyer.
-17. `ELECTRA <https://github.com/google-research/electra>`_ (from Google Research/Stanford University) released with
-    the paper `ELECTRA: Pre-training text encoders as discriminators rather than generators
-    <https://arxiv.org/abs/2003.10555>`_ by Kevin Clark, Minh-Thang Luong, Quoc V. Le, and Christopher D. Manning.
-18. `DialoGPT <https://github.com/microsoft/DialoGPT>`_ (from Microsoft Research) released with the paper `DialoGPT:
-    Large-Scale Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`_ by
-    Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu,
-    and Bill Dolan.
-19. `Reformer <https://github.com/google/trax/tree/master/trax/models/reformer>`_ (from Google Research) released with
-    the paper `Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_ by Nikita Kitaev, Łukasz
-    Kaiser, and Anselm Levskaya.
-20. `MarianMT <https://marian-nmt.github.io/>`_ (developed by the Microsoft Translator Team) machine translation models
-    trained using `OPUS <http://opus.nlpl.eu/>`_ pretrained_models data by Jörg Tiedemann.
-21. `Longformer <https://github.com/allenai/longformer>`_ (from AllenAI) released with the paper `Longformer: The
-    Long-Document Transformer <https://arxiv.org/abs/2004.05150>`_ by Iz Beltagy, Matthew E. Peters, and Arman Cohan.
-22. `DPR <https://github.com/facebookresearch/DPR>`_ (from Facebook) released with the paper `Dense Passage Retrieval
-    for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`_ by Vladimir Karpukhin, Barlas Oğuz, Sewon
-    Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
-23. `Pegasus <https://github.com/google-research/pegasus>`_ (from Google) released with the paper `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization
-    <https://arxiv.org/abs/1912.08777>`_ by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu.
-24. `MBart <https://github.com/pytorch/fairseq/tree/master/examples/mbart>`_ (from Facebook) released with the paper  `Multilingual Denoising Pre-training for Neural Machine Translation <https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov,
-    Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
-25. `LXMERT <https://github.com/airsplay/lxmert>`_ (from UNC Chapel Hill) released with the paper `LXMERT: Learning
-    Cross-Modality Encoder Representations from Transformers for Open-Domain Question
-    Answering <https://arxiv.org/abs/1908.07490>`_ by Hao Tan and Mohit Bansal.
-26. `Funnel Transformer <https://github.com/laiguokun/Funnel-Transformer>`_ (from CMU/Google Brain) released with the paper
-    `Funnel-Transformer: Filtering out Sequential Redundancy for Efficient Language Processing
-    <https://arxiv.org/abs/2006.03236>`_ by Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
-27. `Bert For Sequence Generation <https://tfhub.dev/s?module-type=text-generation&subtype=module,placeholder>`_ (from Google) released with the paper
-    `Leveraging Pre-trained Checkpoints for Sequence Generation Tasks
-    <https://arxiv.org/abs/1907.12461>`_ by Sascha Rothe, Shashi Narayan, Aliaksei Severyn.
-28. `LayoutLM <https://github.com/microsoft/unilm/tree/master/layoutlm>`_ (from Microsoft Research Asia) released with the paper
-    `LayoutLM: Pre-training of Text and Layout for Document Image Understanding
-    <https://arxiv.org/abs/1912.13318>`_ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
-29. `Other community models <https://huggingface.co/models>`_, contributed by the `community
-    <https://huggingface.co/users>`_.
+..
+    This list is updated automatically from the README with `make fix-copies`. Do not update manually!
+
+1. :doc:`ALBERT <model_doc/albert>` (from Google Research and the Toyota Technological Institute at Chicago) released
+   with the paper `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations
+   <https://arxiv.org/abs/1909.11942>`__, by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush
+   Sharma, Radu Soricut.
+2. :doc:`BART <model_doc/bart>` (from Facebook) released with the paper `BART: Denoising Sequence-to-Sequence
+   Pre-training for Natural Language Generation, Translation, and Comprehension
+   <https://arxiv.org/pdf/1910.13461.pdf>`__ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman
+   Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer.
+3. :doc:`BARThez <model_doc/barthez>` (from École polytechnique) released with the paper `BARThez: a Skilled Pretrained
+   French Sequence-to-Sequence Model <https://arxiv.org/abs/2010.12321>`__ by Moussa Kamal Eddine, Antoine J.-P.
+   Tixier, Michalis Vazirgiannis.
+4. :doc:`BERT <model_doc/bert>` (from Google) released with the paper `BERT: Pre-training of Deep Bidirectional
+   Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__ by Jacob Devlin, Ming-Wei Chang,
+   Kenton Lee and Kristina Toutanova.
+5. :doc:`BERT For Sequence Generation <model_doc/bertgeneration>` (from Google) released with the paper `Leveraging
+   Pre-trained Checkpoints for Sequence Generation Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi
+   Narayan, Aliaksei Severyn.
+6. :doc:`Blenderbot <model_doc/blenderbot>` (from Facebook) released with the paper `Recipes for building an
+   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
+   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+7. :doc:`BlenderbotSmall <model_doc/blenderbot_small>` (from Facebook) released with the paper `Recipes for building an
+   open-domain chatbot <https://arxiv.org/abs/2004.13637>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary
+   Williamson, Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston.
+8. :doc:`CamemBERT <model_doc/camembert>` (from Inria/Facebook/Sorbonne) released with the paper `CamemBERT: a Tasty
+   French Language Model <https://arxiv.org/abs/1911.03894>`__ by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz
+   Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
+9. :doc:`CTRL <model_doc/ctrl>` (from Salesforce) released with the paper `CTRL: A Conditional Transformer Language
+   Model for Controllable Generation <https://arxiv.org/abs/1909.05858>`__ by Nitish Shirish Keskar*, Bryan McCann*,
+   Lav R. Varshney, Caiming Xiong and Richard Socher.
+10. :doc:`DeBERTa <model_doc/deberta>` (from Microsoft Research) released with the paper `DeBERTa: Decoding-enhanced
+    BERT with Disentangled Attention <https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao,
+    Weizhu Chen.
+11. :doc:`DialoGPT <model_doc/dialogpt>` (from Microsoft Research) released with the paper `DialoGPT: Large-Scale
+    Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`__ by Yizhe
+    Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
+12. :doc:`DistilBERT <model_doc/distilbert>` (from HuggingFace), released together with the paper `DistilBERT, a
+    distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__ by Victor
+    Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2
+    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, RoBERTa into `DistilRoBERTa
+    <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__, Multilingual BERT into
+    `DistilmBERT <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__ and a German
+    version of DistilBERT.
+13. :doc:`DPR <model_doc/dpr>` (from Facebook) released with the paper `Dense Passage Retrieval for Open-Domain
+    Question Answering <https://arxiv.org/abs/2004.04906>`__ by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick
+    Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, and Wen-tau Yih.
+14. :doc:`ELECTRA <model_doc/electra>` (from Google Research/Stanford University) released with the paper `ELECTRA:
+    Pre-training text encoders as discriminators rather than generators <https://arxiv.org/abs/2003.10555>`__ by Kevin
+    Clark, Minh-Thang Luong, Quoc V. Le, Christopher D. Manning.
+15. :doc:`FlauBERT <model_doc/flaubert>` (from CNRS) released with the paper `FlauBERT: Unsupervised Language Model
+    Pre-training for French <https://arxiv.org/abs/1912.05372>`__ by Hang Le, Loïc Vial, Jibril Frej, Vincent Segonne,
+    Maximin Coavoux, Benjamin Lecouteux, Alexandre Allauzen, Benoît Crabbé, Laurent Besacier, Didier Schwab.
+16. :doc:`Funnel Transformer <model_doc/funnel>` (from CMU/Google Brain) released with the paper `Funnel-Transformer:
+    Filtering out Sequential Redundancy for Efficient Language Processing <https://arxiv.org/abs/2006.03236>`__ by
+    Zihang Dai, Guokun Lai, Yiming Yang, Quoc V. Le.
+17. :doc:`GPT <model_doc/gpt>` (from OpenAI) released with the paper `Improving Language Understanding by Generative
+    Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans
+    and Ilya Sutskever.
+18. :doc:`GPT-2 <model_doc/gpt2>` (from OpenAI) released with the paper `Language Models are Unsupervised Multitask
+    Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford*, Jeffrey Wu*, Rewon Child, David
+    Luan, Dario Amodei** and Ilya Sutskever**.
+19. :doc:`LayoutLM <model_doc/layoutlm>` (from Microsoft Research Asia) released with the paper `LayoutLM: Pre-training
+    of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li,
+    Lei Cui, Shaohan Huang, Furu Wei, Ming Zhou.
+20. :doc:`LED <model_doc/led>` (from AllenAI) released with the paper `Longformer: The Long-Document Transformer
+    <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+21. :doc:`Longformer <model_doc/longformer>` (from AllenAI) released with the paper `Longformer: The Long-Document
+    Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz Beltagy, Matthew E. Peters, Arman Cohan.
+22. :doc:`LXMERT <model_doc/lxmert>` (from UNC Chapel Hill) released with the paper `LXMERT: Learning Cross-Modality
+    Encoder Representations from Transformers for Open-Domain Question Answering <https://arxiv.org/abs/1908.07490>`__
+    by Hao Tan and Mohit Bansal.
+23. :doc:`MarianMT <model_doc/marian>` Machine translation models trained using `OPUS <http://opus.nlpl.eu/>`__ data by
+    Jörg Tiedemann. The `Marian Framework <https://marian-nmt.github.io/>`__ is being developed by the Microsoft
+    Translator Team.
+24. :doc:`MBart <model_doc/mbart>` (from Facebook) released with the paper `Multilingual Denoising Pre-training for
+    Neural Machine Translation <https://arxiv.org/abs/2001.08210>`__ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li,
+    Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+25. :doc:`MPNet <model_doc/mpnet>` (from Microsoft Research) released with the paper `MPNet: Masked and Permuted
+    Pre-training for Language Understanding <https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin,
+    Jianfeng Lu, Tie-Yan Liu.
+26. :doc:`MT5 <model_doc/mt5>` (from Google AI) released with the paper `mT5: A massively multilingual pre-trained
+    text-to-text transformer <https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir
+    Kale, Rami Al-Rfou, Aditya Siddhant, Aditya Barua, Colin Raffel.
+27. :doc:`Pegasus <model_doc/pegasus>` (from Google) released with the paper `PEGASUS: Pre-training with Extracted
+    Gap-sentences for Abstractive Summarization <https://arxiv.org/abs/1912.08777>`__ by Jingqing Zhang, Yao Zhao,
+    Mohammad Saleh and Peter J. Liu.
+28. :doc:`ProphetNet <model_doc/prophetnet>` (from Microsoft Research) released with the paper `ProphetNet: Predicting
+    Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi,
+    Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+29. :doc:`Reformer <model_doc/reformer>` (from Google Research) released with the paper `Reformer: The Efficient
+    Transformer <https://arxiv.org/abs/2001.04451>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.
+30. :doc:`RoBERTa <model_doc/roberta>` (from Facebook), released together with the paper `RoBERTa: A Robustly
+    Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`__ by Yinhan Liu, Myle Ott, Naman Goyal,
+    Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
+31. :doc:`SqueezeBert <model_doc/squeezebert>` released with the paper `SqueezeBERT: What can computer vision teach NLP
+    about efficient neural networks? <https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi
+    Krishna, and Kurt W. Keutzer.
+32. :doc:`T5 <model_doc/t5>` (from Google AI) released with the paper `Exploring the Limits of Transfer Learning with a
+    Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`__ by Colin Raffel and Noam Shazeer and Adam
+    Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
+33. :doc:`TAPAS <model_doc/tapas>` (from Google AI) released with the paper `TAPAS: Weakly Supervised Table Parsing via
+    Pre-training <https://arxiv.org/abs/2004.02349>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
+    Francesco Piccinno and Julian Martin Eisenschlos.
+34. :doc:`Transformer-XL <model_doc/transformerxl>` (from Google/CMU) released with the paper `Transformer-XL:
+    Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`__ by Zihang Dai*,
+    Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+35. :doc:`XLM <model_doc/xlm>` (from Facebook) released together with the paper `Cross-lingual Language Model
+    Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
+36. :doc:`XLM-ProphetNet <model_doc/xlmprophetnet>` (from Microsoft Research) released with the paper `ProphetNet:
+    Predicting Future N-gram for Sequence-to-Sequence Pre-training <https://arxiv.org/abs/2001.04063>`__ by Yu Yan,
+    Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang and Ming Zhou.
+37. :doc:`XLM-RoBERTa <model_doc/xlmroberta>` (from Facebook AI), released together with the paper `Unsupervised
+    Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`__ by Alexis Conneau*, Kartikay
+    Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke
+    Zettlemoyer and Veselin Stoyanov.
+38. :doc:`XLNet <model_doc/xlnet>` (from Google/CMU) released with the paper `XLNet: Generalized Autoregressive
+    Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang*, Zihang Dai*, Yiming
+    Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
+
+
+.. _bigtable:
+
+The table below represents the current support in the library for each of those models: whether they have a Python
+tokenizer (called "slow"), a "fast" tokenizer backed by the 🤗 Tokenizers library, and whether they have support in
+PyTorch, TensorFlow and/or Flax.
+
+..
+    This table is updated automatically from the auto modules with `make fix-copies`. Do not update manually!
+
+.. rst-class:: center-aligned-table
+
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            Model            | Tokenizer slow | Tokenizer fast | PyTorch support | TensorFlow support | Flax Support |
+=============================+================+================+=================+====================+==============+
+|           ALBERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            BART             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            BERT             |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|       Bert Generation       |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         Blenderbot          |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|       BlenderbotSmall       |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            CTRL             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          CamemBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             DPR             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           DeBERTa           |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         DistilBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           ELECTRA           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|       Encoder decoder       |       ❌       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+| FairSeq Machine-Translation |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          FlauBERT           |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|     Funnel Transformer      |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             LED             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           LXMERT            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          LayoutLM           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         Longformer          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            MPNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           Marian            |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         MobileBERT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         OpenAI GPT          |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|        OpenAI GPT-2         |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           Pegasus           |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         ProphetNet          |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             RAG             |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          Reformer           |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|          RetriBERT          |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|           RoBERTa           |       ✅       |       ✅       |       ✅        |         ✅         |      ✅      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         SqueezeBERT         |       ✅       |       ✅       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             T5              |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            TAPAS            |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|       Transformer-XL        |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             XLM             |       ✅       |       ❌       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|         XLM-RoBERTa         |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|        XLMProphetNet        |       ✅       |       ❌       |       ✅        |         ❌         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            XLNet            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|            mBART            |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
+|             mT5             |       ✅       |       ✅       |       ✅        |         ✅         |      ❌      |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+

 .. toctree::
    :maxdepth: 2
@@ -193,6 +345,7 @@ conversion utilities for the following models:
    :maxdepth: 2
    :caption: Main Classes

+    main_classes/callback
    main_classes/configuration
    main_classes/logging
    main_classes/model
@@ -210,10 +363,15 @@ conversion utilities for the following models:
    model_doc/albert
    model_doc/auto
    model_doc/bart
+    model_doc/barthez
    model_doc/bert
+    model_doc/bertweet
    model_doc/bertgeneration
+    model_doc/blenderbot
+    model_doc/blenderbot_small
    model_doc/camembert
    model_doc/ctrl
+    model_doc/deberta
    model_doc/dialogpt
    model_doc/distilbert
    model_doc/dpr
@@ -222,22 +380,31 @@ conversion utilities for the following models:
    model_doc/flaubert
    model_doc/fsmt
    model_doc/funnel
+    model_doc/herbert
    model_doc/layoutlm
+    model_doc/led
    model_doc/longformer
    model_doc/lxmert
    model_doc/marian
    model_doc/mbart
    model_doc/mobilebert
+    model_doc/mpnet
+    model_doc/mt5
    model_doc/gpt
    model_doc/gpt2
    model_doc/pegasus
+    model_doc/phobert
+    model_doc/prophetnet
    model_doc/rag
    model_doc/reformer
    model_doc/retribert
    model_doc/roberta
+    model_doc/squeezebert
    model_doc/t5
+    model_doc/tapas
    model_doc/transformerxl
    model_doc/xlm
+    model_doc/xlmprophetnet
    model_doc/xlmroberta
    model_doc/xlnet

@@ -248,3 +415,5 @@ conversion utilities for the following models:
    internal/modeling_utils
    internal/pipelines_utils
    internal/tokenization_utils
+    internal/trainer_utils
+    internal/generation_utils
--- a/docs/source/installation.md
+++ b/docs/source/installation.md
@@ -1,3 +1,19 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # Installation

 🤗 Transformers is tested on Python 3.6+, and PyTorch 1.1.0+ or TensorFlow 2.0+.
@@ -12,9 +28,10 @@ must install it from source.
 ## Installation with pip

 First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) 
-and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific 
-install command for your platform.
+Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available), 
+[PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) and/or 
+[Flax installation page](https://github.com/google/flax#quick-install)
+regarding the specific install command for your platform.

 When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:

@@ -34,16 +51,22 @@ or 🤗 Transformers and TensorFlow 2.0 in one line with:
 pip install transformers[tf-cpu]
 ```

+or 🤗 Transformers and Flax in one line with:
+
+```bash
+pip install transformers[flax]
+```
+
 To check 🤗 Transformers is properly installed, run the following command:

 ```bash
-python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('I hate you'))"
+python -c "from transformers import pipeline; print(pipeline('sentiment-analysis')('we love you'))"
 ```

 It should download a pretrained model then print something like

 ```bash
-[{'label': 'NEGATIVE', 'score': 0.9991129040718079}]
+[{'label': 'POSITIVE', 'score': 0.9998704791069031}]
 ```

 (Note that TensorFlow will print additional stuff before that last statement.)
@@ -66,23 +89,36 @@ python -c "from transformers import pipeline; print(pipeline('sentiment-analysis

 to check 🤗 Transformers is properly installed.

+
+## With conda
+
+Since Transformers version v4.0.0, we now have a conda channel: `huggingface`.
+
+🤗 Transformers can be installed using conda as follows:
+
+```
+conda install -c huggingface transformers
+```
+
+Follow the installation pages of TensorFlow, PyTorch or Flax to see how to install them with conda. 
+
 ## Caching models

 This library provides pretrained models that will be downloaded and cached locally. Unless you specify a location with
 `cache_dir=...` when you use methods like `from_pretrained`, these models will automatically be downloaded in the
-folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the PyTorch
-cache home followed by ``/transformers/`` (even if you don't have PyTorch installed). This is (by order of priority):
+folder given by the shell environment variable ``TRANSFORMERS_CACHE``. The default value for it will be the Hugging
+Face cache home followed by ``/transformers/``. This is (by order of priority):

-  * shell environment variable ``TORCH_HOME``
-  * shell environment variable ``XDG_CACHE_HOME`` + ``/torch/``
-  * default: ``~/.cache/torch/``
+  * shell environment variable ``HF_HOME`` 
+  * shell environment variable ``XDG_CACHE_HOME`` + ``/huggingface/``
+  * default: ``~/.cache/huggingface/``

 So if you don't have any specific environment variable set, the cache directory will be at
-``~/.cache/torch/transformers/``.
+``~/.cache/huggingface/transformers/``.

-**Note:** If you have set a shell enviromnent variable for one of the predecessors of this library
+**Note:** If you have set a shell environment variable for one of the predecessors of this library
 (``PYTORCH_TRANSFORMERS_CACHE`` or ``PYTORCH_PRETRAINED_BERT_CACHE``), those will be used if there is no shell
-enviromnent variable for ``TRANSFORMERS_CACHE``.
+environment variable for ``TRANSFORMERS_CACHE``.

 ### Note on model downloads (Continuous Integration or large-scale deployments)

@@ -97,6 +133,6 @@ You should check out our [swift-coreml-transformers](https://github.com/huggingf
 It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, 
 `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.

-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch or
+At some point in the future, you'll be able to seamlessly move from pretraining or fine-tuning models in PyTorch or
 TensorFlow 2.0 to productizing them in CoreML, or prototype a model or an app in CoreML then research its
 hyperparameters or architecture from PyTorch or TensorFlow 2.0. Super exciting!
--- a/docs/source/internal/generation_utils.rst
+++ b/docs/source/internal/generation_utils.rst
@@ -0,0 +1,168 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Utilities for Generation
+-----------------------------------------------------------------------------------------------------------------------
+
+This page lists all the utility functions used by :meth:`~transformers.PreTrainedModel.generate`,
+:meth:`~transformers.PreTrainedModel.greedy_search`, :meth:`~transformers.PreTrainedModel.sample`,
+:meth:`~transformers.PreTrainedModel.beam_search`, :meth:`~transformers.PreTrainedModel.beam_sample`, and
+:meth:`~transformers.PreTrainedModel.group_beam_search`.
+
+Most of those are only useful if you are studying the code of the generate methods in the library.
+
+Generate Outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The output of :meth:`~transformers.PreTrainedModel.generate` is an instance of a subclass of
+:class:`~transformers.file_utils.ModelOutput`. This output is a data structure containing all the information returned
+by :meth:`~transformers.PreTrainedModel.generate`, but that can also be used as a tuple or dictionary.
+
+Here's an example:
+
+.. code-block::
+
+    from transformers import GPT2Tokenizer, GPT2LMHeadModel
+
+    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+    model = GPT2LMHeadModel.from_pretrained('gpt2')
+
+    inputs = tokenizer("Hello, my dog is cute and ", return_tensors="pt")
+    generation_output = model.generate(**inputs, return_dict_in_generate=True, output_scores=True)
+
+The ``generation_output`` object is a :class:`~transformers.generation_utils.GreedySearchDecoderOnlyOutput`. As we can
+see in the documentation of that class below, it has the following attributes:
+
+- ``sequences``: the generated sequences of tokens
+- ``scores`` (optional): the prediction scores of the language modeling head, for each generation step
+- ``hidden_states`` (optional): the hidden states of the model, for each generation step
+- ``attentions`` (optional): the attention weights of the model, for each generation step
+
+Here we have the ``scores`` since we passed along ``output_scores=True``, but we don't have ``hidden_states`` and
+``attentions`` because we didn't pass ``output_hidden_states=True`` or ``output_attentions=True``.
+
+You can access each attribute as you would usually do, and if that attribute has not been returned by the model, you
+will get ``None``. Here for instance ``generation_output.scores`` are all the generated prediction scores of the
+language modeling head, and ``generation_output.attentions`` is ``None``.
+
+When using our ``generation_output`` object as a tuple, it only keeps the attributes that don't have ``None`` values.
+Here, for instance, it has two elements, ``sequences`` then ``scores``, so
+
+.. code-block::
+
+    generation_output[:2]
+
+will return the tuple ``(generation_output.sequences, generation_output.scores)`` for instance.
+
+When using our ``generation_output`` object as a dictionary, it only keeps the attributes that don't have ``None``
+values. Here, for instance, it has two keys that are ``sequences`` and ``scores``.
+
+We document here all output types.
+
+
+GreedySearchOutput
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: transformers.generation_utils.GreedySearchDecoderOnlyOutput
+    :members:
+
+.. autoclass:: transformers.generation_utils.GreedySearchEncoderDecoderOutput
+    :members:
+
+
+SampleOutput
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: transformers.generation_utils.SampleDecoderOnlyOutput
+    :members:
+
+.. autoclass:: transformers.generation_utils.SampleEncoderDecoderOutput
+    :members:
+
+
+BeamSearchOutput
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: transformers.generation_utils.BeamSearchDecoderOnlyOutput
+    :members:
+
+.. autoclass:: transformers.generation_utils.BeamSearchEncoderDecoderOutput
+    :members:
+
+
+BeamSampleOutput
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. autoclass:: transformers.generation_utils.BeamSampleDecoderOnlyOutput
+    :members:
+
+.. autoclass:: transformers.generation_utils.BeamSampleEncoderDecoderOutput
+    :members:
+
+
+LogitsProcessor
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+A :class:`~transformers.LogitsProcessor` can be used to modify the prediction scores of a language model head for
+generation.
+
+.. autoclass:: transformers.LogitsProcessor
+    :members: __call__
+
+.. autoclass:: transformers.LogitsProcessorList
+    :members: __call__
+
+.. autoclass:: transformers.LogitsWarper
+    :members: __call__
+
+.. autoclass:: transformers.MinLengthLogitsProcessor
+    :members: __call__
+
+.. autoclass:: transformers.TemperatureLogitsWarper
+    :members: __call__
+
+.. autoclass:: transformers.RepetitionPenaltyLogitsProcessor
+    :members: __call__
+
+.. autoclass:: transformers.TopPLogitsWarper
+    :members: __call__
+
+.. autoclass:: transformers.TopKLogitsWarper
+    :members: __call__
+
+.. autoclass:: transformers.NoRepeatNGramLogitsProcessor
+    :members: __call__
+
+.. autoclass:: transformers.NoBadWordsLogitsProcessor
+    :members: __call__
+
+.. autoclass:: transformers.PrefixConstrainedLogitsProcessor
+    :members: __call__
+
+.. autoclass:: transformers.HammingDiversityLogitsProcessor
+    :members: __call__
+
+BeamSearch
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BeamScorer
+    :members: process, finalize
+
+.. autoclass:: transformers.BeamSearchScorer
+    :members: process, finalize
+
+Utilities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autofunction:: transformers.top_k_top_p_filtering
+
+.. autofunction:: transformers.tf_top_k_top_p_filtering
--- a/docs/source/internal/modeling_utils.rst
+++ b/docs/source/internal/modeling_utils.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Custom Layers and Utilities
 -----------------------------------------------------------------------------------------------------------------------

@@ -79,10 +91,8 @@ TensorFlow loss functions
 TensorFlow Helper Functions
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autofunction:: transformers.modeling_tf_utils.cast_bool_to_primitive
-
 .. autofunction:: transformers.modeling_tf_utils.get_initializer

 .. autofunction:: transformers.modeling_tf_utils.keras_serializable

-.. autofunction:: transformers.modeling_tf_utils.shape_list
+.. autofunction:: transformers.modeling_tf_utils.shape_list
--- a/docs/source/internal/pipelines_utils.rst
+++ b/docs/source/internal/pipelines_utils.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Utilities for pipelines
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/internal/tokenization_utils.rst
+++ b/docs/source/internal/tokenization_utils.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Utilities for Tokenizers
 -----------------------------------------------------------------------------------------------------------------------

@@ -25,6 +37,7 @@ SpecialTokensMixin

 Enums and namedtuples
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 .. autoclass:: transformers.tokenization_utils_base.ExplicitEnum

 .. autoclass:: transformers.tokenization_utils_base.PaddingStrategy
--- a/docs/source/internal/trainer_utils.rst
+++ b/docs/source/internal/trainer_utils.rst
@@ -0,0 +1,48 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Utilities for Trainer
+-----------------------------------------------------------------------------------------------------------------------
+
+This page lists all the utility functions used by :class:`~transformers.Trainer`.
+
+Most of those are only useful if you are studying the code of the Trainer in the library.
+
+Utilities
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.EvalPrediction
+
+.. autoclass:: transformers.EvaluationStrategy
+
+.. autofunction:: transformers.set_seed
+
+.. autofunction:: transformers.torch_distributed_zero_first
+
+
+Callbacks internals
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.trainer_callback.CallbackHandler
+
+
+Distributed Evaluation
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.trainer_pt_utils.DistributedTensorGatherer
+    :members:
+
+
+Argument Parsing
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.HfArgumentParser
--- a/docs/source/main_classes/callback.rst
+++ b/docs/source/main_classes/callback.rst
@@ -0,0 +1,89 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Callbacks
+-----------------------------------------------------------------------------------------------------------------------
+
+Callbacks are objects that can customize the behavior of the training loop in the PyTorch
+:class:`~transformers.Trainer` (this feature is not yet implemented in TensorFlow): they can inspect the training loop
+state (for progress reporting, logging on TensorBoard or other ML platforms...) and take decisions (like early
+stopping).
+
+Callbacks are "read only" pieces of code: apart from the :class:`~transformers.TrainerControl` object they return, they
+cannot change anything in the training loop. For customizations that require changes in the training loop, you should
+subclass :class:`~transformers.Trainer` and override the methods you need (see :doc:`trainer` for examples).
+
+By default a :class:`~transformers.Trainer` will use the following callbacks:
+
+- :class:`~transformers.DefaultFlowCallback` which handles the default behavior for logging, saving and evaluation.
+- :class:`~transformers.PrinterCallback` or :class:`~transformers.ProgressCallback` to display progress and print the
+  logs (the first one is used if you deactivate tqdm through the :class:`~transformers.TrainingArguments`, otherwise
+  it's the second one).
+- :class:`~transformers.integrations.TensorBoardCallback` if tensorboard is accessible (either through PyTorch >= 1.4
+  or tensorboardX).
+- :class:`~transformers.integrations.WandbCallback` if `wandb <https://www.wandb.com/>`__ is installed.
+- :class:`~transformers.integrations.CometCallback` if `comet_ml <https://www.comet.ml/site/>`__ is installed.
+- :class:`~transformers.integrations.MLflowCallback` if `mlflow <https://www.mlflow.org/>`__ is installed.
+- :class:`~transformers.integrations.AzureMLCallback` if `azureml-sdk <https://pypi.org/project/azureml-sdk/>`__ is
+  installed.
+
+The main class that implements callbacks is :class:`~transformers.TrainerCallback`. It gets the
+:class:`~transformers.TrainingArguments` used to instantiate the :class:`~transformers.Trainer`, can access that
+Trainer's internal state via :class:`~transformers.TrainerState`, and can take some actions on the training loop via
+:class:`~transformers.TrainerControl`.
+
+
+Available Callbacks
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here is the list of the available :class:`~transformers.TrainerCallback` in the library:
+
+.. autoclass:: transformers.integrations.CometCallback
+    :members: setup
+
+.. autoclass:: transformers.DefaultFlowCallback
+
+.. autoclass:: transformers.PrinterCallback
+
+.. autoclass:: transformers.ProgressCallback
+
+.. autoclass:: transformers.EarlyStoppingCallback
+
+.. autoclass:: transformers.integrations.TensorBoardCallback
+
+.. autoclass:: transformers.integrations.WandbCallback
+    :members: setup
+
+.. autoclass:: transformers.integrations.MLflowCallback
+    :members: setup
+
+.. autoclass:: transformers.integrations.AzureMLCallback
+
+TrainerCallback
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TrainerCallback
+    :members:
+
+
+TrainerState
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TrainerState
+    :members:
+
+
+TrainerControl
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TrainerControl
+    :members:
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Configuration
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/main_classes/logging.rst
+++ b/docs/source/main_classes/logging.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Logging
 -----------------------------------------------------------------------------------------------------------------------

@@ -17,7 +29,7 @@ You can also use the environment variable ``TRANSFORMERS_VERBOSITY`` to override
 to one of the following: ``debug``, ``info``, ``warning``, ``error``, ``critical``. For example:

 .. code-block:: bash
-               
+
    TRANSFORMERS_VERBOSITY=error ./myprogram.py

 All the methods of this logging module are documented below, the main ones are
@@ -55,4 +67,4 @@ Other functions

 .. autofunction:: transformers.logging.enable_explicit_format

-.. autofunction:: transformers.logging.reset_format
+.. autofunction:: transformers.logging.reset_format
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -1,9 +1,22 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Models
 -----------------------------------------------------------------------------------------------------------------------

-The base classes :class:`~transformers.PreTrainedModel` and :class:`~transformers.TFPreTrainedModel` implement the
-common methods for loading/saving a model either from a local file or directory, or from a pretrained model
-configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
+The base classes :class:`~transformers.PreTrainedModel`, :class:`~transformers.TFPreTrainedModel`, and
+:class:`~transformers.FlaxPreTrainedModel` implement the common methods for loading/saving a model either from a local
+file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS
+S3 repository).

 :class:`~transformers.PreTrainedModel` and :class:`~transformers.TFPreTrainedModel` also implement a few methods which
 are common among all the models to:
@@ -45,11 +58,18 @@ TFModelUtilsMixin
    :members:


-Generative models
+FlaxPreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxPreTrainedModel
+    :members:
+
+
+Generation
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.generation_utils.GenerationMixin
    :members:

 .. autoclass:: transformers.generation_tf_utils.TFGenerationMixin
-    :members:
+    :members:
--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Optimization
 -----------------------------------------------------------------------------------------------------------------------

@@ -31,6 +43,10 @@ Schedules
 Learning Rate Schedules (Pytorch)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

+.. autoclass:: transformers.SchedulerType
+
+.. autofunction:: transformers.get_scheduler
+
 .. autofunction:: transformers.get_constant_schedule


@@ -62,6 +78,10 @@ Learning Rate Schedules (Pytorch)
    :target: /imgs/warmup_linear_schedule.png
    :alt:

+
+.. autofunction:: transformers.get_polynomial_decay_schedule_with_warmup
+
+
 Warmup (TensorFlow)
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

--- a/docs/source/main_classes/output.rst
+++ b/docs/source/main_classes/output.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Model outputs
 -----------------------------------------------------------------------------------------------------------------------

@@ -65,12 +77,34 @@ BaseModelOutputWithPooling
    :members:


+BaseModelOutputWithCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithCrossAttentions
+    :members:
+
+
+BaseModelOutputWithPoolingAndCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions
+    :members:
+
+
 BaseModelOutputWithPast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPast
    :members:

+
+BaseModelOutputWithPastAndCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.BaseModelOutputWithPastAndCrossAttentions
+    :members:
+
+
 Seq2SeqModelOutput
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -85,6 +119,13 @@ CausalLMOutput
    :members:


+CausalLMOutputWithCrossAttentions
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_outputs.CausalLMOutputWithCrossAttentions
+    :members:
+
+
 CausalLMOutputWithPast
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/main_classes/pipelines.rst
+++ b/docs/source/main_classes/pipelines.rst
@@ -1,8 +1,20 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Pipelines
 -----------------------------------------------------------------------------------------------------------------------

-The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most
-of the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity
+The pipelines are a great and easy way to use models for inference. These pipelines are objects that abstract most of
+the complex code from the library, offering a simple API dedicated to several tasks, including Named Entity
 Recognition, Masked Language Modeling, Sentiment Analysis, Feature Extraction and Question Answering. See the
 :doc:`task summary <../task_summary>` for examples of use.

@@ -22,12 +34,13 @@ There are two categories of pipeline abstractions to be aware about:
    - :class:`~transformers.TranslationPipeline`
    - :class:`~transformers.ZeroShotClassificationPipeline`
    - :class:`~transformers.Text2TextGenerationPipeline`
+    - :class:`~transformers.TableQuestionAnsweringPipeline`

 The pipeline abstraction
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any
-other pipeline but requires an additional argument which is the `task`.
+The `pipeline` abstraction is a wrapper around all the other available pipelines. It is instantiated as any other
+pipeline but requires an additional argument which is the `task`.

 .. autofunction:: transformers.pipeline

@@ -61,8 +74,9 @@ FillMaskPipeline
 NerPipeline
 =======================================================================================================================

-This class is an alias of the :class:`~transformers.TokenClassificationPipeline` defined below. Please refer to that
-pipeline for documentation and usage examples.
+.. autoclass:: transformers.NerPipeline
+
+See :class:`~transformers.TokenClassificationPipeline` for all details.

 QuestionAnsweringPipeline
 =======================================================================================================================
@@ -78,6 +92,13 @@ SummarizationPipeline
    :special-members: __call__
    :members:

+TableQuestionAnsweringPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.TableQuestionAnsweringPipeline
+    :special-members: __call__
+
+
 TextClassificationPipeline
 =======================================================================================================================

@@ -106,6 +127,13 @@ TokenClassificationPipeline
    :special-members: __call__
    :members:

+TranslationPipeline
+=======================================================================================================================
+
+.. autoclass:: transformers.TranslationPipeline
+    :special-members: __call__
+    :members:
+
 ZeroShotClassificationPipeline
 =======================================================================================================================

--- a/docs/source/main_classes/processors.rst
+++ b/docs/source/main_classes/processors.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Processors
 -----------------------------------------------------------------------------------------------------------------------

@@ -8,8 +20,8 @@ Processors
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 All processors follow the same architecture which is that of the
-:class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list
-of :class:`~transformers.data.processors.utils.InputExample`. These
+:class:`~transformers.data.processors.utils.DataProcessor`. The processor returns a list of
+:class:`~transformers.data.processors.utils.InputExample`. These
 :class:`~transformers.data.processors.utils.InputExample` can be converted to
 :class:`~transformers.data.processors.utils.InputFeatures` in order to be fed to the model.

@@ -28,14 +40,16 @@ of :class:`~transformers.data.processors.utils.InputExample`. These
 GLUE
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-`General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates
-the performance of models across a diverse set of existing NLU tasks. It was released together with the paper
-`GLUE: A multi-task benchmark and analysis platform for natural language understanding <https://openreview.net/pdf?id=rJ4km2R5t7>`__
+`General Language Understanding Evaluation (GLUE) <https://gluebenchmark.com/>`__ is a benchmark that evaluates the
+performance of models across a diverse set of existing NLU tasks. It was released together with the paper `GLUE: A
+multi-task benchmark and analysis platform for natural language understanding
+<https://openreview.net/pdf?id=rJ4km2R5t7>`__.

-This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched),
-CoLA, SST2, STSB, QQP, QNLI, RTE and WNLI.
+This library hosts a total of 10 processors for the following tasks: MRPC, MNLI, MNLI (mismatched), CoLA, SST2, STSB,
+QQP, QNLI, RTE and WNLI.

 Those processors are:
+
    - :class:`~transformers.data.processors.utils.MrpcProcessor`
    - :class:`~transformers.data.processors.utils.MnliProcessor`
    - :class:`~transformers.data.processors.utils.MnliMismatchedProcessor`
@@ -46,7 +60,7 @@ Those processors are:
    - :class:`~transformers.data.processors.utils.RteProcessor`
    - :class:`~transformers.data.processors.utils.WnliProcessor`

-Additionally, the following method  can be used to load values from a data file and convert them to a list of
+Additionally, the following method can be used to load values from a data file and convert them to a list of
 :class:`~transformers.data.processors.utils.InputExample`.

 .. automethod:: transformers.data.processors.glue.glue_convert_examples_to_features
@@ -54,36 +68,39 @@ Additionally, the following method  can be used to load values from a data file
 Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-An example using these processors is given in the `run_glue.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_glue.py>`__ script.
+An example using these processors is given in the `run_glue.py
+<https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_glue.py>`__ script.


 XNLI
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-`The Cross-Lingual NLI Corpus (XNLI) <https://www.nyu.edu/projects/bowman/xnli/>`__ is a benchmark that evaluates
-the quality of cross-lingual text representations. 
-XNLI is crowd-sourced dataset based on `MultiNLI <http://www.nyu.edu/projects/bowman/multinli/>`: pairs of text are labeled with textual entailment 
-annotations for 15 different languages (including both high-resource language such as English and low-resource languages such as Swahili).
+`The Cross-Lingual NLI Corpus (XNLI) <https://www.nyu.edu/projects/bowman/xnli/>`__ is a benchmark that evaluates the
+quality of cross-lingual text representations. XNLI is a crowd-sourced dataset based on `MultiNLI
+<http://www.nyu.edu/projects/bowman/multinli/>`__: pairs of text are labeled with textual entailment annotations for 15
+different languages (including both high-resource languages such as English and low-resource languages such as Swahili).

-It was released together with the paper
-`XNLI: Evaluating Cross-lingual Sentence Representations <https://arxiv.org/abs/1809.05053>`__
+It was released together with the paper `XNLI: Evaluating Cross-lingual Sentence Representations
+<https://arxiv.org/abs/1809.05053>`__.

 This library hosts the processor to load the XNLI data:
+
    - :class:`~transformers.data.processors.utils.XnliProcessor`

 Please note that since the gold labels are available on the test set, evaluation is performed on the test set.

-An example using these processors is given in the
-`run_xnli.py <https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_xnli.py>`__ script.
+An example using these processors is given in the `run_xnli.py
+<https://github.com/huggingface/pytorch-transformers/blob/master/examples/text-classification/run_xnli.py>`__ script.


 SQuAD
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer//>`__ is a benchmark that evaluates
-the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version (v1.1) was released together with the paper
-`SQuAD: 100,000+ Questions for Machine Comprehension of Text <https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside 
-the paper `Know What You Don't Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.
+`The Stanford Question Answering Dataset (SQuAD) <https://rajpurkar.github.io/SQuAD-explorer//>`__ is a benchmark that
+evaluates the performance of models on question answering. Two versions are available, v1.1 and v2.0. The first version
+(v1.1) was released together with the paper `SQuAD: 100,000+ Questions for Machine Comprehension of Text
+<https://arxiv.org/abs/1606.05250>`__. The second version (v2.0) was released alongside the paper `Know What You Don't
+Know: Unanswerable Questions for SQuAD <https://arxiv.org/abs/1806.03822>`__.

 This library hosts a processor for each of the two versions:

@@ -91,6 +108,7 @@ Processors
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Those processors are:
+
    - :class:`~transformers.data.processors.utils.SquadV1Processor`
    - :class:`~transformers.data.processors.utils.SquadV2Processor`

@@ -99,20 +117,21 @@ They both inherit from the abstract class :class:`~transformers.data.processors.
 .. autoclass:: transformers.data.processors.squad.SquadProcessor
    :members:

-Additionally, the following method can be used to convert SQuAD examples into :class:`~transformers.data.processors.utils.SquadFeatures`
-that can be used as model inputs.
+Additionally, the following method can be used to convert SQuAD examples into
+:class:`~transformers.data.processors.utils.SquadFeatures` that can be used as model inputs.

 .. automethod:: transformers.data.processors.squad.squad_convert_examples_to_features

-These processors as well as the aforementionned method can be used with files containing the data as well as with the `tensorflow_datasets` package.
-Examples are given below.
+These processors as well as the aforementioned method can be used with files containing the data as well as with the
+`tensorflow_datasets` package. Examples are given below.


 Example usage
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 Here is an example using the processors as well as the conversion method using data files:

-Example::
+.. code-block::

    # Loading a V2 processor
    processor = SquadV2Processor()
@@ -133,7 +152,7 @@ Example::

 Using `tensorflow_datasets` is as easy as using a data file:

-Example::
+.. code-block::

    # tensorflow_datasets only handle Squad V1.
    tfds_examples = tfds.load("squad")
@@ -149,5 +168,5 @@ Example::
    )


-Another example using these processors is given in the
-`run_squad.py <https://github.com/huggingface/transformers/blob/master/examples/question-answering/run_squad.py>`__ script.
+Another example using these processors is given in the :prefix_link:`run_squad.py
+<examples/question-answering/run_squad.py>` script.
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Tokenizer
 -----------------------------------------------------------------------------------------------------------------------

@@ -29,11 +41,12 @@ methods for using all the tokenizers:

 :class:`~transformers.BatchEncoding` holds the output of the tokenizer's encoding methods (``__call__``,
 ``encode_plus`` and ``batch_encode_plus``) and is derived from a Python dictionary. When the tokenizer is a pure python
-tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by these
-methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e., backed by HuggingFace
-`tokenizers library <https://github.com/huggingface/tokenizers>`__), this class provides in addition several advanced
-alignment methods which can be used to map between the original string (character and words) and the token space (e.g.,
-getting the index of the token comprising a given character or the span of characters corresponding to a given token).
+tokenizer, this class behaves just like a standard python dictionary and holds the various model inputs computed by
+these methods (``input_ids``, ``attention_mask``...). When the tokenizer is a "Fast" tokenizer (i.e., backed by
+HuggingFace `tokenizers library <https://github.com/huggingface/tokenizers>`__), this class provides in addition
+several advanced alignment methods which can be used to map between the original string (character and words) and the
+token space (e.g., getting the index of the token comprising a given character or the span of characters corresponding
+to a given token).


 PreTrainedTokenizer
--- a/docs/source/main_classes/trainer.rst
+++ b/docs/source/main_classes/trainer.rst
@@ -1,10 +1,22 @@
+..
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Trainer
 -----------------------------------------------------------------------------------------------------------------------

 The :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` classes provide an API for feature-complete
 training in most standard use cases. It's used in most of the :doc:`example scripts <../examples>`.

-Before instantiating your :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`, create a 
+Before instantiating your :class:`~transformers.Trainer`/:class:`~transformers.TFTrainer`, create a
 :class:`~transformers.TrainingArguments`/:class:`~transformers.TFTrainingArguments` to access all the points of
 customization during training.

@@ -15,10 +27,9 @@ Both :class:`~transformers.Trainer` and :class:`~transformers.TFTrainer` contain
 previous features. To inject custom behavior you can subclass them and override the following methods:

 - **get_train_dataloader**/**get_train_tfdataset** -- Creates the training DataLoader (PyTorch) or TF Dataset.
- **get_eval_dataloader**/**get_eval_tfdataset** -- Creates the evaulation DataLoader (PyTorch) or TF Dataset.
+- **get_eval_dataloader**/**get_eval_tfdataset** -- Creates the evaluation DataLoader (PyTorch) or TF Dataset.
 - **get_test_dataloader**/**get_test_tfdataset** -- Creates the test DataLoader (PyTorch) or TF Dataset.
 - **log** -- Logs information on the various objects watching training.
- **setup_wandb** -- Setups wandb (see `here <https://docs.wandb.com/huggingface>`__ for more information).
 - **create_optimizer_and_scheduler** -- Setups the optimizer and learning rate scheduler if they were not passed at
  init.
 - **compute_loss** - Computes the loss on a batch of training inputs.
@@ -36,10 +47,14 @@ Here is an example of how to customize :class:`~transformers.Trainer` using a cu
    class MyTrainer(Trainer):
        def compute_loss(self, model, inputs):
            labels = inputs.pop("labels")
-            outputs = models(**inputs)
+            outputs = model(**inputs)
            logits = outputs[0]
            return my_custom_loss(logits, labels)

+Another way to customize the training loop behavior for the PyTorch :class:`~transformers.Trainer` is to use
+:doc:`callbacks <callback>` that can inspect the training loop state (for progress reporting, logging on TensorBoard or
+other ML platforms...) and take decisions (like early stopping).
+

 Trainer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -47,29 +62,472 @@ Trainer
 .. autoclass:: transformers.Trainer
    :members:

+
+Seq2SeqTrainer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Seq2SeqTrainer
+    :members: evaluate, predict
+
+
 TFTrainer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFTrainer
    :members:

+
 TrainingArguments
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TrainingArguments
    :members:

+
+Seq2SeqTrainingArguments
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.Seq2SeqTrainingArguments
+    :members:
+
+
 TFTrainingArguments
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFTrainingArguments
    :members:

-Utilities
+
+Trainer Integrations
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.EvalPrediction

-.. autofunction:: transformers.set_seed

-.. autofunction:: transformers.torch_distributed_zero_first
+The :class:`~transformers.Trainer` has been extended to support libraries that may dramatically improve your training
+time and fit much bigger models.
+
+Currently it supports third party solutions, `DeepSpeed <https://github.com/microsoft/DeepSpeed>`__ and `FairScale
+<https://github.com/facebookresearch/fairscale/>`__, which implement parts of the paper `ZeRO: Memory Optimizations
+Toward Training Trillion Parameter Models, by Samyam Rajbhandari, Jeff Rasley, Olatunji Ruwase, Yuxiong He
+<https://arxiv.org/abs/1910.02054>`__.
+
+This provided support is new and experimental as of this writing.
+
+You will need at least 2 GPUs to benefit from these features.
+
+FairScale
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+By integrating `FairScale <https://github.com/facebookresearch/fairscale/>`__ the :class:`~transformers.Trainer`
+provides support for the following features from `the ZeRO paper <https://arxiv.org/abs/1910.02054>`__:
+
+1. Optimizer State Sharding
+2. Gradient Sharding
+
+To deploy this feature:
+
+1. Install the library via pypi:
+
+   .. code-block:: bash
+
+       pip install fairscale
+
+   or find more details on `FairScale's GitHub page
+   <https://github.com/facebookresearch/fairscale/#installation>`__.
+
+2. Add ``--sharded_ddp`` to the command line arguments, and make sure you have added the distributed launcher ``-m
+   torch.distributed.launch --nproc_per_node=NUMBER_OF_GPUS_YOU_HAVE`` if you haven't been using it already.
+
+For example here is how you could use it for ``finetune_trainer.py`` with 2 GPUs:
+
+.. code-block:: bash
+
+    cd examples/seq2seq
+    python -m torch.distributed.launch --nproc_per_node=2 ./finetune_trainer.py \
+    --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
+    --output_dir output_dir --overwrite_output_dir \
+    --do_train --n_train 500 --num_train_epochs 1 \
+    --per_device_train_batch_size 1  --freeze_embeds \
+    --src_lang en_XX --tgt_lang ro_RO --task translation \
+    --fp16 --sharded_ddp
+
+Notes:
+
+- This feature requires distributed training (so multiple GPUs).
+- It is not implemented for TPUs.
+- It works with ``--fp16`` too, to make things even faster.
+- One of the main benefits of enabling ``--sharded_ddp`` is that it uses a lot less GPU memory, so you should be able
+  to use significantly larger batch sizes using the same hardware (e.g. 3x and even bigger) which should lead to
+  significantly shorter training time.
+
+
+DeepSpeed
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+
+`DeepSpeed <https://github.com/microsoft/DeepSpeed>`__ implements everything described in the `ZeRO paper
+<https://arxiv.org/abs/1910.02054>`__, except ZeRO's stage 3, "Parameter Partitioning (Pos+g+p)". Currently it provides
+full support for:
+
+1. Optimizer State Partitioning (ZeRO stage 1)
+2. Gradient Partitioning (ZeRO stage 2)
+
+To deploy this feature:
+
+1. Install the library via pypi:
+
+   .. code-block:: bash
+
+       pip install deepspeed
+
+   or find more details on `DeepSpeed's GitHub page <https://github.com/microsoft/deepspeed#installation>`__.
+
+2. Adjust the :class:`~transformers.Trainer` command line arguments as follows:
+
+   1. replace ``python -m torch.distributed.launch`` with ``deepspeed``.
+   2. add a new argument ``--deepspeed ds_config.json``, where ``ds_config.json`` is the DeepSpeed configuration file
+      as documented `here <https://www.deepspeed.ai/docs/config-json/>`__. The file naming is up to you.
+
+   Therefore, if your original command line looked as follows:
+
+   .. code-block:: bash
+
+       python -m torch.distributed.launch --nproc_per_node=2 your_program.py <normal cl args>
+
+   Now it should be:
+
+   .. code-block:: bash
+
+       deepspeed --num_gpus=2 your_program.py <normal cl args> --deepspeed ds_config.json
+
+   Unlike ``torch.distributed.launch``, where you have to specify how many GPUs to use with ``--nproc_per_node``, with
+   the ``deepspeed`` launcher you don't have to use the corresponding ``--num_gpus`` if you want all of your GPUs used.
+   The full details on how to configure various nodes and GPUs can be found `here
+   <https://www.deepspeed.ai/getting-started/#resource-configuration-multi-node>`__.
+
+   Here is an example of running ``finetune_trainer.py`` under DeepSpeed deploying all available GPUs:
+
+   .. code-block:: bash
+
+       cd examples/seq2seq
+       deepspeed ./finetune_trainer.py --deepspeed ds_config.json \
+       --model_name_or_path sshleifer/distill-mbart-en-ro-12-4 --data_dir wmt_en_ro \
+       --output_dir output_dir --overwrite_output_dir \
+       --do_train --n_train 500 --num_train_epochs 1 \
+       --per_device_train_batch_size 1  --freeze_embeds \
+       --src_lang en_XX --tgt_lang ro_RO --task translation
+
+   Note that in the DeepSpeed documentation you are likely to see ``--deepspeed --deepspeed_config ds_config.json`` -
+   i.e. two DeepSpeed-related arguments, but for the sake of simplicity, and since there are already so many arguments
+   to deal with, we combined the two into a single argument.
+
+Before you can deploy DeepSpeed, let's discuss its configuration.
+
+**Configuration:**
+
+For the complete guide to the DeepSpeed configuration options that can be used in its configuration file please refer
+to the `following documentation <https://www.deepspeed.ai/docs/config-json/>`__.
+
+While you always have to supply the DeepSpeed configuration file, you can configure the DeepSpeed integration in
+several ways:
+
+1. Supply most of the configuration inside the file, and just use a few required command line arguments. This is the
+   recommended way as it puts most of the configuration params in one place.
+2. Supply just the ZeRO configuration params inside the file, and configure the rest using the normal
+   :class:`~transformers.Trainer` command line arguments.
+3. Any variation of the first two ways.
+
+To get an idea of what a DeepSpeed configuration file looks like, here is one that activates ZeRO stage 2 features,
+enables FP16, uses AdamW optimizer and WarmupLR scheduler:
+
+.. code-block:: json
+
+    {
+        "fp16": {
+            "enabled": true,
+            "loss_scale": 0,
+            "loss_scale_window": 1000,
+            "hysteresis": 2,
+            "min_loss_scale": 1
+        },
+
+       "zero_optimization": {
+           "stage": 2,
+           "allgather_partitions": true,
+           "allgather_bucket_size": 5e8,
+           "overlap_comm": true,
+           "reduce_scatter": true,
+           "reduce_bucket_size": 5e8,
+           "contiguous_gradients": true,
+           "cpu_offload": true
+       },
+
+       "optimizer": {
+         "type": "AdamW",
+         "params": {
+           "lr": 3e-5,
+           "betas": [ 0.8, 0.999 ],
+           "eps": 1e-8,
+           "weight_decay": 3e-7
+         }
+       },
+       "zero_allow_untested_optimizer": true,
+
+       "scheduler": {
+         "type": "WarmupLR",
+         "params": {
+           "warmup_min_lr": 0,
+           "warmup_max_lr": 3e-5,
+           "warmup_num_steps": 500
+         }
+       }
+    }
+
+If you already have a command line that you have been using with :class:`transformers.Trainer` args, you can continue
+using those and the :class:`~transformers.Trainer` will automatically convert them into the corresponding DeepSpeed
+configuration at run time. For example, you could use the following configuration file:
+
+.. code-block:: json
+
+    {
+       "zero_optimization": {
+           "stage": 2,
+           "allgather_partitions": true,
+           "allgather_bucket_size": 5e8,
+           "overlap_comm": true,
+           "reduce_scatter": true,
+           "reduce_bucket_size": 5e8,
+           "contiguous_gradients": true,
+           "cpu_offload": true
+       }
+    }
+
+and the following command line arguments:
+
+.. code-block:: bash
+
+    --learning_rate 3e-5 --warmup_steps 500 --adam_beta1 0.8 --adam_beta2 0.999 --adam_epsilon 1e-8 \
+    --weight_decay 3e-7 --lr_scheduler_type constant_with_warmup --fp16 --fp16_backend amp
+
+to achieve the same configuration as provided by the longer json file in the first example.
+
+When you execute the program, DeepSpeed will log the configuration it received from the :class:`~transformers.Trainer`
+to the console, so you can see exactly what final configuration was passed to it.
+
+**Shared Configuration:**
+
+Some configuration information is required by both the :class:`~transformers.Trainer` and DeepSpeed to function
+correctly, therefore, to prevent conflicting definitions, which could lead to hard-to-detect errors, we chose to
+configure those via the :class:`~transformers.Trainer` command line arguments.
+
+Therefore, the following DeepSpeed configuration params shouldn't be used with the :class:`~transformers.Trainer`:
+
+* ``train_batch_size``
+* ``train_micro_batch_size_per_gpu``
+* ``gradient_accumulation_steps``
+
+as these will be automatically derived from the run time environment and the following 2 command line arguments:
+
+.. code-block:: bash
+
+    --per_device_train_batch_size 8 --gradient_accumulation_steps 2
+
+which are always required to be supplied.
+
+Of course, you will need to adjust the values in this example to your situation.
+
+
+
+**ZeRO:**
+
+The ``zero_optimization`` section of the configuration file is the most important part (`docs
+<https://www.deepspeed.ai/docs/config-json/#zero-optimizations-for-fp16-training>`__), since that is where you define
+which ZeRO stages you want to enable and how to configure them.
+
+.. code-block:: json
+
+    {
+       "zero_optimization": {
+           "stage": 2,
+           "allgather_partitions": true,
+           "allgather_bucket_size": 5e8,
+           "overlap_comm": true,
+           "reduce_scatter": true,
+           "reduce_bucket_size": 5e8,
+           "contiguous_gradients": true,
+           "cpu_offload": true
+       }
+    }
+
+Notes:
+
+- enabling ``cpu_offload`` should reduce GPU RAM usage (it requires ``"stage": 2``)
+- ``"overlap_comm": true`` trades off increased GPU RAM usage to lower all-reduce latency. ``overlap_comm`` uses 4.5x
+  the ``allgather_bucket_size`` and ``reduce_bucket_size`` values. So if they are set to 5e8, this requires a 9GB
+  footprint (``5e8 x 2Bytes x 2 x 4.5``). Therefore, if you have a GPU with 8GB or less RAM, to avoid getting
+  OOM-errors you will need to reduce those parameters to about ``2e8``, which would require 3.6GB.
+
+This section has to be configured exclusively via DeepSpeed configuration - the :class:`~transformers.Trainer` provides
+no equivalent command line arguments.
+
+
+
+**Optimizer:**
+
+
+DeepSpeed's main optimizers are Adam, OneBitAdam, and Lamb. These have been thoroughly tested with ZeRO and are thus
+recommended to be used. It, however, can import other optimizers from ``torch``. The full documentation is `here
+<https://www.deepspeed.ai/docs/config-json/#optimizer-parameters>`__.
+
+If you don't configure the ``optimizer`` entry in the configuration file, the :class:`~transformers.Trainer` will
+automatically set it to ``AdamW`` and will use the supplied values or the defaults for the following command line
+arguments: ``--learning_rate``, ``--adam_beta1``, ``--adam_beta2``, ``--adam_epsilon`` and ``--weight_decay``.
+
+Here is an example of the pre-configured ``optimizer`` entry for AdamW:
+
+.. code-block:: json
+
+    {
+       "zero_allow_untested_optimizer": true,
+       "optimizer": {
+           "type": "AdamW",
+           "params": {
+             "lr": 0.001,
+             "betas": [0.8, 0.999],
+             "eps": 1e-8,
+             "weight_decay": 3e-7
+           }
+         }
+    }
+
+Since AdamW isn't on the list of optimizers tested with DeepSpeed/ZeRO, we have to add the
+``zero_allow_untested_optimizer`` flag.
+
+If you want to use one of the officially supported optimizers, configure them explicitly in the configuration file, and
+make sure to adjust the values, e.g. if you use Adam you will want ``weight_decay`` around ``0.01``.
+
+
+**Scheduler:**
+
+DeepSpeed supports LRRangeTest, OneCycle, WarmupLR and WarmupDecayLR LR schedulers. The full documentation is `here
+<https://www.deepspeed.ai/docs/config-json/#scheduler-parameters>`__.
+
+If you don't configure the ``scheduler`` entry in the configuration file, the :class:`~transformers.Trainer` will use
+the value of ``--lr_scheduler_type`` to configure it. Currently the :class:`~transformers.Trainer` supports only 2 LR
+schedulers that are also supported by DeepSpeed:
+
+* ``WarmupLR`` via ``--lr_scheduler_type constant_with_warmup``
+* ``WarmupDecayLR`` via ``--lr_scheduler_type linear``. This is also the default value for ``--lr_scheduler_type``,
+  therefore, if you don't configure the scheduler this is the scheduler that will get configured by default.
+
+In either case, the values of ``--learning_rate`` and ``--warmup_steps`` will be used for the configuration.
+
+In other words, if you don't use the configuration file to set the ``scheduler`` entry, provide either:
+
+.. code-block:: bash
+
+    --lr_scheduler_type constant_with_warmup --learning_rate 3e-5 --warmup_steps 500
+
+or
+
+.. code-block:: bash
+
+    --lr_scheduler_type linear --learning_rate 3e-5 --warmup_steps 500
+
+with the desired values. If you don't pass these arguments, reasonable default values will be used instead.
+
+In the case of WarmupDecayLR ``total_num_steps`` gets set either via the ``--max_steps`` command line argument, or if
+it is not provided, derived automatically at run time based on the environment and the size of the dataset and other
+command line arguments.
+
+Here is an example of the pre-configured ``scheduler`` entry for WarmupLR (``constant_with_warmup`` in the
+:class:`~transformers.Trainer` API):
+
+.. code-block:: json
+
+    {
+       "scheduler": {
+             "type": "WarmupLR",
+             "params": {
+                 "warmup_min_lr": 0,
+                 "warmup_max_lr": 0.001,
+                 "warmup_num_steps": 1000
+             }
+         }
+    }
+
+**Automatic Mixed Precision:**
+
+You can work with FP16 in one of the following ways:
+
+1. Pytorch native amp, as documented `here <https://www.deepspeed.ai/docs/config-json/#fp16-training-options>`__.
+2. NVIDIA's apex, as documented `here
+   <https://www.deepspeed.ai/docs/config-json/#automatic-mixed-precision-amp-training-options>`__.
+
+If you want to use an equivalent of the pytorch native amp, you can either configure the ``fp16`` entry in the
+configuration file, or use the following command line arguments: ``--fp16 --fp16_backend amp``.
+
+Here is an example of the ``fp16`` configuration:
+
+.. code-block:: json
+
+    {
+        "fp16": {
+            "enabled": true,
+            "loss_scale": 0,
+            "loss_scale_window": 1000,
+            "hysteresis": 2,
+            "min_loss_scale": 1
+        }
+    }
+
+If you want to use NVIDIA's apex instead, you can either configure the ``amp`` entry in the configuration file, or
+use the following command line arguments: ``--fp16 --fp16_backend apex --fp16_opt_level O1``.
+
+Here is an example of the ``amp`` configuration:
+
+.. code-block:: json
+
+    {
+        "amp": {
+            "enabled": true,
+            "opt_level": "O1"
+        }
+    }
+
+
+
+**Gradient Clipping:**
+
+If you don't configure the ``gradient_clipping`` entry in the configuration file, the :class:`~transformers.Trainer`
+will use the value of the ``--max_grad_norm`` command line argument to set it.
+
+Here is an example of the ``gradient_clipping`` configuration:
+
+.. code-block:: json
+
+    {
+        "gradient_clipping": 1.0
+    }
+
+
+
+**Notes:**
+
+* DeepSpeed works with the PyTorch :class:`~transformers.Trainer` but not TF :class:`~transformers.TFTrainer`.
+* While DeepSpeed has a pip installable PyPI package, it is highly recommended that it gets installed from `source
+  <https://github.com/microsoft/deepspeed#installation>`__ to best match your hardware and also if you need to enable
+  certain features, like 1-bit Adam, which aren't available in the pypi distribution.
+* You don't have to use the :class:`~transformers.Trainer` to use DeepSpeed with HuggingFace ``transformers`` - you can
+  use any model with your own trainer, and you will have to adapt the latter according to `the DeepSpeed integration
+  instructions <https://www.deepspeed.ai/getting-started/#writing-deepspeed-models>`__.
+
+**Main DeepSpeed Resources:**
+
+- `github <https://github.com/microsoft/deepspeed>`__
+- `Usage docs <https://www.deepspeed.ai/getting-started/>`__
+- `API docs <https://deepspeed.readthedocs.io/en/latest/index.html>`__
+
+Finally, please remember that the HuggingFace :class:`~transformers.Trainer` only integrates DeepSpeed; therefore, if
+you have any problems or questions with regard to DeepSpeed usage, please file an issue with `DeepSpeed github
+<https://github.com/microsoft/DeepSpeed/issues>`__.
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -1,5 +1,186 @@
+<!---
+Copyright 2020 The HuggingFace Team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
 # Migrating from previous packages

+## Migrating from transformers `v3.x` to `v4.x`
+
+A couple of changes were introduced when the switch from version 3 to version 4 was done. Below is a summary of the
+expected changes:
+
+#### 1. AutoTokenizers and pipelines now use fast (rust) tokenizers by default.
+
+The python and rust tokenizers have roughly the same API, but the rust tokenizers have a more complete feature set. 
+
+This introduces two breaking changes:
+- The handling of overflowing tokens between the python and rust tokenizers is different.
+- The rust tokenizers do not accept integers in the encoding methods.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+- The pipelines now contain additional features out of the box. See the [token-classification pipeline with the `grouped_entities` flag](https://huggingface.co/transformers/main_classes/pipelines.html?highlight=textclassification#tokenclassificationpipeline).
+- The auto-tokenizers now return rust tokenizers. In order to obtain the python tokenizers instead, the user may use the `use_fast` flag by setting it to `False`:
+
+In version `v3.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")
+```
+to obtain the same in version `v4.x`:
+```py
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=False)
+```
+
+#### 2. SentencePiece is removed from the required dependencies
+
+The requirement on the SentencePiece dependency has been lifted from the `setup.py`. This is done so that we may have a channel on anaconda cloud without relying on `conda-forge`. This means that the tokenizers that depend on the SentencePiece library will not be available with a standard `transformers` installation.
+
+This includes the **slow** versions of:
+- `XLNetTokenizer`
+- `AlbertTokenizer`
+- `CamembertTokenizer`
+- `MBartTokenizer`
+- `PegasusTokenizer`
+- `T5Tokenizer`
+- `ReformerTokenizer`
+- `XLMRobertaTokenizer`
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should install `sentencepiece` additionally:
+
+In version `v3.x`:
+```bash
+pip install transformers
+```
+to obtain the same in version `v4.x`:
+```bash
+pip install transformers[sentencepiece]
+```
+or
+```bash
+pip install transformers sentencepiece
+```
+#### 3. The architecture of the repo has been updated so that each model resides in its folder
+
+The past and foreseeable addition of new models means that the number of files in the directory `src/transformers` keeps growing and becomes harder to navigate and understand. We made the choice to put each model and the files accompanying it in their own sub-directories.
+
+This is a breaking change as importing intermediary layers using a model's module directly needs to be done via a different path.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should update the path used to access the layers. 
+
+In version `v3.x`:
+```py
+from transformers.modeling_bert import BertLayer
+```
+to obtain the same in version `v4.x`:
+```py
+from transformers.models.bert.modeling_bert import BertLayer
+```
+
+#### 4. Switching the `return_dict` argument to `True` by default
+
+The [`return_dict` argument](https://huggingface.co/transformers/main_classes/output.html) enables the return of dict-like python objects containing the model outputs, instead of the standard tuples. This object is self-documented as keys can be used to retrieve values, while also behaving as a tuple as users may retrieve objects by index or by slice.
+
+This is a breaking change as the limitation of that tuple is that it cannot be unpacked: `value0, value1 = outputs` will not work.
+
+##### How to obtain the same behavior as v3.x in v4.x
+
+In order to obtain the same behavior as version `v3.x`, you should specify the `return_dict` argument to `False`, either in the model configuration or during the forward pass.
+
+In version `v3.x`:
+```py
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs)
+```
+to obtain the same in version `v4.x`:
+```py
+model = BertModel.from_pretrained("bert-base-cased")
+outputs = model(**inputs, return_dict=False)
+```
+or
+```py
+model = BertModel.from_pretrained("bert-base-cased", return_dict=False)
+outputs = model(**inputs)
+```
+
+#### 5. Removed some deprecated attributes
+
+Attributes that were deprecated have been removed if they had been deprecated for at least a month. The full list of deprecated attributes can be found in [#8604](https://github.com/huggingface/transformers/pull/8604).
+
+Here is a list of these attributes/methods/arguments and what their replacements should be:
+
+In several models, the labels become consistent with the other models:
+- `masked_lm_labels` becomes `labels` in `AlbertForMaskedLM` and `AlbertForPreTraining`.
+- `masked_lm_labels` becomes `labels` in `BertForMaskedLM` and `BertForPreTraining`.
+- `masked_lm_labels` becomes `labels` in `DistilBertForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `ElectraForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `LongformerForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `MobileBertForMaskedLM`.
+- `masked_lm_labels` becomes `labels` in `RobertaForMaskedLM`.
+- `lm_labels` becomes `labels` in `BartForConditionalGeneration`.
+- `lm_labels` becomes `labels` in `GPT2DoubleHeadsModel`.
+- `lm_labels` becomes `labels` in `OpenAIGPTDoubleHeadsModel`.
+- `lm_labels` becomes `labels` in `T5ForConditionalGeneration`.
+
+In several models, the caching mechanism becomes consistent with the other models:
+- `decoder_cached_states` becomes `past_key_values` in all BART-like, FSMT and T5 models.
+- `decoder_past_key_values` becomes `past_key_values` in all BART-like, FSMT and T5 models.
+- `past` becomes `past_key_values` in all CTRL models.
+- `past` becomes `past_key_values` in all GPT-2 models.
+
+Regarding the tokenizer classes:
+- The tokenizer attribute `max_len` becomes `model_max_length`.
+- The tokenizer attribute `return_lengths` becomes `return_length`.
+- The tokenizer encoding argument `is_pretokenized` becomes `is_split_into_words`.
+
+Regarding the `Trainer` class:
+- The `Trainer` argument `tb_writer` is removed in favor of the callback `TensorBoardCallback(tb_writer=...)`.
+- The `Trainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
+- The `Trainer` attribute `data_collator` should be a callable.
+- The `Trainer` method `_log` is deprecated in favor of `log`.
+- The `Trainer` method `_training_step` is deprecated in favor of `training_step`.
+- The `Trainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
+- The `Trainer` method `is_local_master` is deprecated in favor of `is_local_process_zero`.
+- The `Trainer` method `is_world_master` is deprecated in favor of `is_world_process_zero`.
+
+Regarding the `TFTrainer` class:
+- The `TFTrainer` argument `prediction_loss_only` is removed in favor of the class argument `args.prediction_loss_only`.
+- The `TFTrainer` method `_log` is deprecated in favor of `log`.
+- The `TFTrainer` method `_prediction_loop` is deprecated in favor of `prediction_loop`.
+- The `TFTrainer` method `_setup_wandb` is deprecated in favor of `setup_wandb`.
+- The `TFTrainer` method `_run_model` is deprecated in favor of `run_model`.
+
+Regarding the `TrainingArguments` class:
+- The `TrainingArguments` argument `evaluate_during_training` is deprecated in favor of `evaluation_strategy`.
+
+Regarding the Transfo-XL model:
+- The Transfo-XL configuration attribute `tie_weight` becomes `tie_word_embeddings`.
+- The Transfo-XL modeling method `reset_length` becomes `reset_memory_length`.
+
+Regarding pipelines:
+- The `FillMaskPipeline` argument `topk` becomes `top_k`.
+
+
+
 ## Migrating from pytorch-transformers to 🤗 Transformers

 Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to 🤗 Transformers.
@@ -20,7 +201,7 @@ Here is a quick summary of what you should take care of when migrating from `pyt

 The main breaking change when migrating from `pytorch-pretrained-bert` to 🤗 Transformers is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.

-The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
+The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).

 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.

@@ -109,7 +290,7 @@ for batch in train_data:
    loss.backward()
    optimizer.step()

-### In 🤗 Transformers, optimizer and schedules are splitted and instantiated like this:
+### In 🤗 Transformers, optimizer and schedules are split and instantiated like this:
 optimizer = AdamW(model.parameters(), lr=lr, correct_bias=False)  # To reproduce BertAdam specific behavior set correct_bias=False
 scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=num_training_steps)  # PyTorch scheduler
 ### and used like this:
--- a/docs/source/model_doc/albert.rst
+++ b/docs/source/model_doc/albert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 ALBERT
 -----------------------------------------------------------------------------------------------------------------------

@@ -19,14 +31,14 @@ downstream tasks. However, at some point further model increases become harder d
 longer training times, and unexpected model degradation. To address these problems, we present two parameter-reduction
 techniques to lower memory consumption and increase the training speed of BERT. Comprehensive empirical evidence shows
 that our proposed methods lead to models that scale much better compared to the original BERT. We also use a
-self-supervised loss that focuses on modeling inter-sentence coherence, and show it consistently helps downstream
-tasks with multi-sentence inputs. As a result, our best model establishes new state-of-the-art results on the GLUE,
-RACE, and SQuAD benchmarks while having fewer parameters compared to BERT-large.*
+self-supervised loss that focuses on modeling inter-sentence coherence, and show it consistently helps downstream tasks
+with multi-sentence inputs. As a result, our best model establishes new state-of-the-art results on the GLUE, RACE, and
+SQuAD benchmarks while having fewer parameters compared to BERT-large.*

 Tips:

- ALBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
-  the right rather than the left.
+- ALBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather
+  than the left.
 - ALBERT uses repeating layers which results in a small memory footprint, however the computational cost remains
  similar to a BERT-like architecture with the same number of hidden layers as it has to iterate through the same
  number of (repeating) layers.
@@ -48,13 +60,20 @@ AlbertTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+AlbertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AlbertTokenizerFast
+    :members:
+
+
 Albert specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_albert.AlbertForPreTrainingOutput
+.. autoclass:: transformers.models.albert.modeling_albert.AlbertForPreTrainingOutput
    :members:

-.. autoclass:: transformers.modeling_tf_albert.TFAlbertForPreTrainingOutput
+.. autoclass:: transformers.models.albert.modeling_tf_albert.TFAlbertForPreTrainingOutput
    :members:


--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -1,10 +1,21 @@
-AutoClasses
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Auto Classes
 -----------------------------------------------------------------------------------------------------------------------

 In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you
-are supplying to the :obj:`from_pretrained()` method.
-AutoClasses are here to do this job for you so that you automatically retrieve the relevant model given the name/path
-to the pretrained weights/config/vocabulary.
+are supplying to the :obj:`from_pretrained()` method. AutoClasses are here to do this job for you so that you
+automatically retrieve the relevant model given the name/path to the pretrained weights/config/vocabulary.

 Instantiating one of :class:`~transformers.AutoConfig`, :class:`~transformers.AutoModel`, and
 :class:`~transformers.AutoTokenizer` will directly create a class of the relevant architecture. For instance
@@ -47,10 +58,24 @@ AutoModelForPreTraining
    :members:


-AutoModelWithLMHead
+AutoModelForCausalLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.AutoModelWithLMHead
+.. autoclass:: transformers.AutoModelForCausalLM
+    :members:
+
+
+AutoModelForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForMaskedLM
+    :members:
+
+
+AutoModelForSeq2SeqLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForSeq2SeqLM
    :members:


@@ -68,6 +93,13 @@ AutoModelForMultipleChoice
    :members:


+AutoModelForNextSentencePrediction
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForNextSentencePrediction
+    :members:
+
+
 AutoModelForTokenClassification
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -82,6 +114,13 @@ AutoModelForQuestionAnswering
    :members:


+AutoModelForTableQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AutoModelForTableQuestionAnswering
+    :members:
+
+
 TFAutoModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -96,10 +135,24 @@ TFAutoModelForPreTraining
    :members:


-TFAutoModelWithLMHead
+TFAutoModelForCausalLM
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.TFAutoModelWithLMHead
+.. autoclass:: transformers.TFAutoModelForCausalLM
+    :members:
+
+
+TFAutoModelForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForMaskedLM
+    :members:
+
+
+TFAutoModelForSeq2SeqLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForSeq2SeqLM
    :members:


@@ -129,3 +182,10 @@ TFAutoModelForQuestionAnswering

 .. autoclass:: transformers.TFAutoModelForQuestionAnswering
    :members:
+
+
+FlaxAutoModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxAutoModel
+    :members:
--- a/docs/source/model_doc/bart.rst
+++ b/docs/source/model_doc/bart.rst
@@ -1,38 +1,85 @@
-Bart
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BART
 -----------------------------------------------------------------------------------------------------------------------
-**DISCLAIMER:** If you see something strange,
-file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
-@sshleifer
+
+**DISCLAIMER:** If you see something strange, file a `Github Issue
+<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+@patrickvonplaten

 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The Bart model was `proposed <https://arxiv.org/abs/1910.13461>`_ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019.
+The Bart model was proposed in `BART: Denoising Sequence-to-Sequence Pre-training for Natural Language Generation,
+Translation, and Comprehension <https://arxiv.org/abs/1910.13461>`__ by Mike Lewis, Yinhan Liu, Naman Goyal, Marjan
+Ghazvininejad, Abdelrahman Mohamed, Omer Levy, Ves Stoyanov and Luke Zettlemoyer on 29 Oct, 2019.
+
 According to the abstract,

- Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a left-to-right decoder (like GPT).
- The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme, where spans of text are replaced with a single mask token.
- BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains of up to 6 ROUGE.
+- Bart uses a standard seq2seq/machine translation architecture with a bidirectional encoder (like BERT) and a
+  left-to-right decoder (like GPT).
+- The pretraining task involves randomly shuffling the order of the original sentences and a novel in-filling scheme,
+  where spans of text are replaced with a single mask token.
+- BART is particularly effective when fine tuned for text generation but also works well for comprehension tasks. It
+  matches the performance of RoBERTa with comparable training resources on GLUE and SQuAD, achieves new
+  state-of-the-art results on a range of abstractive dialogue, question answering, and summarization tasks, with gains
+  of up to 6 ROUGE.

-The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/bart>`_
+The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/bart>`__.
+
+
+Examples
+_______________________________________________________________________________________________________________________
+
+- Examples and scripts for fine-tuning BART and other models for sequence to sequence tasks can be found in
+  :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
+- An example of how to train :class:`~transformers.BartForConditionalGeneration` with a Hugging Face :obj:`datasets`
+  object can be found in this `forum discussion
+  <https://discuss.huggingface.co/t/train-bart-for-conditional-generation-e-g-summarization/1904>`__.
+- `Distilled checkpoints <https://huggingface.co/models?search=distilbart>`__ are described in this `paper
+  <https://arxiv.org/abs/2010.13002>`__.


 Implementation Notes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

- Bart doesn't use :obj:`token_type_ids` for sequence classification. Use BartTokenizer.encode to get the proper splitting.
- The forward pass of ``BartModel`` will create decoder inputs (using the helper function ``transformers.modeling_bart._prepare_bart_decoder_inputs``)  if they are not passed. This is different than some other modeling APIs.
- Model predictions are intended to be identical to the original implementation. This only works, however, if the string you pass to ``fairseq.encode`` starts with a space.
- ``BartForConditionalGeneration.generate`` should be used for conditional generation tasks like summarization, see the example in that docstrings
- Models that load the ``"facebook/bart-large-cnn"`` weights will not have a ``mask_token_id``, or be able to perform mask filling tasks.
- for training/forward passes that don't involve beam search, pass ``use_cache=False``
+- Bart doesn't use :obj:`token_type_ids` for sequence classification. Use :class:`~transformers.BartTokenizer` or
+  :meth:`~transformers.BartTokenizer.encode` to get the proper splitting.
+- The forward pass of :class:`~transformers.BartModel` will create the ``decoder_input_ids`` if they are not passed.
+  This is different than some other modeling APIs. A typical use case of this feature is mask filling.
+- Model predictions are intended to be identical to the original implementation when
+  :obj:`force_bos_token_to_be_generated=True`. This only works, however, if the string you pass to
+  :func:`fairseq.encode` starts with a space.
+- :meth:`~transformers.BartForConditionalGeneration.generate` should be used for conditional generation tasks like
+  summarization, see the example in its docstring.
+- Models that load the :obj:`facebook/bart-large-cnn` weights will not have a :obj:`mask_token_id`, or be able to
+  perform mask-filling tasks.

-
-BartForConditionalGeneration
+Mask Filling
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.BartForConditionalGeneration
-    :members: forward
+The :obj:`facebook/bart-base` and :obj:`facebook/bart-large` checkpoints can be used to fill multi-token masks.
+
+.. code-block::
+
+    from transformers import BartForConditionalGeneration, BartTokenizer
+    model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", force_bos_token_to_be_generated=True)
+    tok = BartTokenizer.from_pretrained("facebook/bart-large")
+    example_english_phrase = "UN Chief Says There Is No <mask> in Syria"
+    batch = tok(example_english_phrase, return_tensors='pt')
+    generated_ids = model.generate(batch['input_ids'])
+    assert tok.batch_decode(generated_ids, skip_special_tokens=True) == ['UN Chief Says There Is No Plan to Stop Chemical Weapons in Syria']
+


 BartConfig
@@ -49,6 +96,12 @@ BartTokenizer
    :members:


+BartTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartTokenizerFast
+    :members:
+

 BartModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -56,7 +109,12 @@ BartModel
 .. autoclass:: transformers.BartModel
    :members: forward

-.. autofunction:: transformers.modeling_bart._prepare_bart_decoder_inputs
+
+BartForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BartForConditionalGeneration
+    :members: forward


 BartForSequenceClassification
@@ -73,3 +131,16 @@ BartForQuestionAnswering
    :members: forward


+
+TFBartModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBartModel
+    :members: call
+
+
+TFBartForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBartForConditionalGeneration
+    :members: call
--- a/docs/source/model_doc/barthez.rst
+++ b/docs/source/model_doc/barthez.rst
@@ -0,0 +1,59 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+BARThez
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BARThez model was proposed in `BARThez: a Skilled Pretrained French Sequence-to-Sequence Model
+<https://arxiv.org/abs/2010.12321>`__ by Moussa Kamal Eddine, Antoine J.-P. Tixier, Michalis Vazirgiannis on 23 Oct,
+2020.
+
+The abstract of the paper:
+
+
+*Inductive transfer learning, enabled by self-supervised learning, have taken the entire Natural Language Processing
+(NLP) field by storm, with models such as BERT and BART setting new state of the art on countless natural language
+understanding tasks. While there are some notable exceptions, most of the available models and research have been
+conducted for the English language. In this work, we introduce BARThez, the first BART model for the French language
+(to the best of our knowledge). BARThez was pretrained on a very large monolingual French corpus from past research
+that we adapted to suit BART's perturbation schemes. Unlike already existing BERT-based French language models such as
+CamemBERT and FlauBERT, BARThez is particularly well-suited for generative tasks, since not only its encoder but also
+its decoder is pretrained. In addition to discriminative tasks from the FLUE benchmark, we evaluate BARThez on a novel
+summarization dataset, OrangeSum, that we release with this paper. We also continue the pretraining of an already
+pretrained multilingual BART on BARThez's corpus, and we show that the resulting model, which we call mBARTHez,
+provides a significant boost over vanilla BARThez, and is on par with or outperforms CamemBERT and FlauBERT.*
+
+The Authors' code can be found `here <https://github.com/moussaKam/BARThez>`__.
+
+
+Examples
+_______________________________________________________________________________________________________________________
+
+- BARThez can be fine-tuned on sequence-to-sequence tasks in a similar way as BART, check:
+  :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
+
+
+BarthezTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BarthezTokenizer
+    :members:
+
+
+BarthezTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BarthezTokenizerFast
+    :members:
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 BERT
 -----------------------------------------------------------------------------------------------------------------------

@@ -25,8 +37,8 @@ improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement).*

 Tips:

- BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
-  the right rather than the left.
+- BERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
+  the left.
 - BERT was trained with the masked language modeling (MLM) and next sentence prediction (NSP) objectives. It is
  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation.

@@ -57,10 +69,10 @@ BertTokenizerFast
 Bert specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_bert.BertForPreTrainingOutput
+.. autoclass:: transformers.models.bert.modeling_bert.BertForPreTrainingOutput
    :members:

-.. autoclass:: transformers.modeling_tf_bert.TFBertForPreTrainingOutput
+.. autoclass:: transformers.models.bert.modeling_tf_bert.TFBertForPreTrainingOutput
    :members:


@@ -188,3 +200,17 @@ TFBertForQuestionAnswering

 .. autoclass:: transformers.TFBertForQuestionAnswering
    :members: call
+
+
+FlaxBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBertModel
+    :members: __call__
+
+
+FlaxBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxBertForMaskedLM
+    :members: __call__
--- a/docs/source/model_doc/bertgeneration.rst
+++ b/docs/source/model_doc/bertgeneration.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 BertGeneration
 -----------------------------------------------------------------------------------------------------------------------

@@ -10,7 +22,7 @@ Tasks <https://arxiv.org/abs/1907.12461>`__ by Sascha Rothe, Shashi Narayan, Ali

 The abstract from the paper is the following:

-*Unsupervised pre-training of large neural models has recently revolutionized Natural Language Processing. By
+*Unsupervised pretraining of large neural models has recently revolutionized Natural Language Processing. By
 warm-starting from the publicly released checkpoints, NLP practitioners have pushed the state-of-the-art on multiple
 benchmarks while saving significant amounts of compute time. So far the focus has been mainly on the Natural Language
 Understanding tasks. In this paper, we demonstrate the efficacy of pre-trained checkpoints for Sequence Generation. We
@@ -24,15 +36,15 @@ Usage:
 - The model can be used in combination with the :class:`~transformers.EncoderDecoderModel` to leverage two pretrained
  BERT checkpoints for subsequent fine-tuning.

-:: code-block
-  
+.. code-block::
+
  # leverage checkpoints for Bert2Bert model...
  # use BERT's cls token as BOS token and sep token as EOS token
  encoder = BertGenerationEncoder.from_pretrained("bert-large-uncased", bos_token_id=101, eos_token_id=102)
  # add cross attention layers and use BERT's cls token as BOS token and sep token as EOS token
  decoder = BertGenerationDecoder.from_pretrained("bert-large-uncased", add_cross_attention=True, is_decoder=True, bos_token_id=101, eos_token_id=102)
  bert2bert = EncoderDecoderModel(encoder=encoder, decoder=decoder)
-  
+
  # create tokenizer...
  tokenizer = BertTokenizer.from_pretrained("bert-large-uncased")

@@ -40,14 +52,14 @@ Usage:
  labels = tokenizer('This is a short summary', return_tensors="pt").input_ids

  # train...
-  loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels, return_dict=True).loss
+  loss = bert2bert(input_ids=input_ids, decoder_input_ids=labels, labels=labels).loss
  loss.backward()


 - Pretrained :class:`~transformers.EncoderDecoderModel` are also directly available in the model hub, e.g.,


-:: code-block
+.. code-block::

  # instantiate sentence fusion model
  sentence_fuser = EncoderDecoderModel.from_pretrained("google/roberta2roberta_L-24_discofuse")
--- a/docs/source/model_doc/bertweet.rst
+++ b/docs/source/model_doc/bertweet.rst
@@ -0,0 +1,64 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Bertweet
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The BERTweet model was proposed in `BERTweet: A pre-trained language model for English Tweets
+<https://www.aclweb.org/anthology/2020.emnlp-demos.2.pdf>`__ by Dat Quoc Nguyen, Thanh Vu, Anh Tuan Nguyen.
+
+The abstract from the paper is the following:
+
+*We present BERTweet, the first public large-scale pre-trained language model for English Tweets. Our BERTweet, having
+the same architecture as BERT-base (Devlin et al., 2019), is trained using the RoBERTa pre-training procedure (Liu et
+al., 2019). Experiments show that BERTweet outperforms strong baselines RoBERTa-base and XLM-R-base (Conneau et al.,
+2020), producing better performance results than the previous state-of-the-art models on three Tweet NLP tasks:
+Part-of-speech tagging, Named-entity recognition and text classification.*
+
+Example of use:
+
+.. code-block::
+
+  import torch
+  from transformers import AutoModel, AutoTokenizer 
+
+  bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
+
+  # For transformers v4.x+: 
+  tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
+
+  # For transformers v3.x: 
+  # tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")
+
+  # INPUT TWEET IS ALREADY NORMALIZED!
+  line = "SC has first two presumptive cases of coronavirus , DHEC confirms HTTPURL via @USER :cry:"
+
+  input_ids = torch.tensor([tokenizer.encode(line)])
+
+  with torch.no_grad():
+      features = bertweet(input_ids)  # Models outputs are now tuples
+
+  ## With TensorFlow 2.0+:
+  # from transformers import TFAutoModel
+  # bertweet = TFAutoModel.from_pretrained("vinai/bertweet-base")
+
+
+The original code can be found `here <https://github.com/VinAIResearch/BERTweet>`__.
+
+BertweetTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BertweetTokenizer
+    :members: 
--- a/docs/source/model_doc/blenderbot.rst
+++ b/docs/source/model_doc/blenderbot.rst
@@ -0,0 +1,112 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Blenderbot
+-----------------------------------------------------------------------------------------------------------------------
+
+**DISCLAIMER:** If you see something strange, file a `Github Issue
+<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ .
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Blender chatbot model was proposed in `Recipes for building an open-domain chatbot
+<https://arxiv.org/pdf/2004.13637.pdf>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson,
+Yinhan Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston on 30 Apr 2020.
+
+The abstract of the paper is the following:
+
+*Building open-domain chatbots is a challenging area for machine learning research. While prior work has shown that
+scaling neural models in the number of parameters and the size of the data they are trained on gives improved results,
+we show that other ingredients are important for a high-performing chatbot. Good conversation requires a number of
+skills that an expert conversationalist blends in a seamless way: providing engaging talking points and listening to
+their partners, and displaying knowledge, empathy and personality appropriately, while maintaining a consistent
+persona. We show that large scale models can learn these skills when given appropriate training data and choice of
+generation strategy. We build variants of these recipes with 90M, 2.7B and 9.4B parameter models, and make our models
+and code publicly available. Human evaluations show our best models are superior to existing approaches in multi-turn
+dialogue in terms of engagingness and humanness measurements. We then discuss the limitations of this work by analyzing
+failure cases of our models.*
+
+The authors' code can be found `here <https://github.com/facebookresearch/ParlAI>`__ .
+
+
+Implementation Notes
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Blenderbot uses a standard `seq2seq model transformer <https://arxiv.org/pdf/1706.03762.pdf>`__ based architecture.
+- Available checkpoints can be found in the `model hub <https://huggingface.co/models?search=blenderbot>`__.
+- This is the `default` Blenderbot model class. However, some smaller checkpoints, such as
+  ``facebook/blenderbot_small_90M``, have a different architecture and consequently should be used with
+  `BlenderbotSmall <https://huggingface.co/transformers/master/model_doc/blenderbot_small.html>`__.
+
+
+Usage
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here is an example of model usage:
+
+.. code-block::
+
+        >>> from transformers import BlenderbotTokenizer, BlenderbotForConditionalGeneration
+        >>> mname = 'facebook/blenderbot-400M-distill'
+        >>> model = BlenderbotForConditionalGeneration.from_pretrained(mname)
+        >>> tokenizer = BlenderbotTokenizer.from_pretrained(mname)
+        >>> UTTERANCE = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer([UTTERANCE], return_tensors='pt')
+        >>> reply_ids = model.generate(**inputs)
+        >>> print(tokenizer.batch_decode(reply_ids))
+        ["<s> That's unfortunate. Are they trying to lose weight or are they just trying to be healthier?</s>"]
+
+
+BlenderbotConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotConfig
+    :members:
+
+BlenderbotTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotTokenizer
+    :members: build_inputs_with_special_tokens
+
+
+BlenderbotModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+See :obj:`transformers.BartModel` for arguments to `forward` and `generate`.
+
+.. autoclass:: transformers.BlenderbotModel
+    :members: forward
+
+
+BlenderbotForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+See :obj:`transformers.BartForConditionalGeneration` for arguments to `forward` and `generate`.
+
+.. autoclass:: transformers.BlenderbotForConditionalGeneration
+    :members: forward
+
+
+TFBlenderbotModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBlenderbotModel
+    :members: call
+
+
+TFBlenderbotForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBlenderbotForConditionalGeneration
+    :members: call
--- a/docs/source/model_doc/blenderbot_small.rst
+++ b/docs/source/model_doc/blenderbot_small.rst
@@ -0,0 +1,84 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+Blenderbot Small
+-----------------------------------------------------------------------------------------------------------------------
+
+Note that :class:`~transformers.BlenderbotSmallModel` and
+:class:`~transformers.BlenderbotSmallForConditionalGeneration` are only used in combination with the checkpoint
+`facebook/blenderbot-90M <https://huggingface.co/facebook/blenderbot-90M>`__. Larger Blenderbot checkpoints should
+instead be used with :class:`~transformers.BlenderbotModel` and
+:class:`~transformers.BlenderbotForConditionalGeneration`.
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The Blender chatbot model was proposed in `Recipes for building an open-domain chatbot
+<https://arxiv.org/pdf/2004.13637.pdf>`__ by Stephen Roller, Emily Dinan, Naman Goyal, Da Ju, Mary Williamson, Yinhan
+Liu, Jing Xu, Myle Ott, Kurt Shuster, Eric M. Smith, Y-Lan Boureau, Jason Weston on 30 Apr 2020.
+
+The abstract of the paper is the following:
+
+*Building open-domain chatbots is a challenging area for machine learning research. While prior work has shown that
+scaling neural models in the number of parameters and the size of the data they are trained on gives improved results,
+we show that other ingredients are important for a high-performing chatbot. Good conversation requires a number of
+skills that an expert conversationalist blends in a seamless way: providing engaging talking points and listening to
+their partners, and displaying knowledge, empathy and personality appropriately, while maintaining a consistent
+persona. We show that large scale models can learn these skills when given appropriate training data and choice of
+generation strategy. We build variants of these recipes with 90M, 2.7B and 9.4B parameter models, and make our models
+and code publicly available. Human evaluations show our best models are superior to existing approaches in multi-turn
+dialogue in terms of engagingness and humanness measurements. We then discuss the limitations of this work by analyzing
+failure cases of our models.*
+
+The authors' code can be found `here <https://github.com/facebookresearch/ParlAI>`__.
+
+BlenderbotSmallConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallConfig
+    :members:
+
+
+BlenderbotSmallTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+BlenderbotSmallModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallModel
+    :members: forward
+
+
+BlenderbotSmallForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.BlenderbotSmallForConditionalGeneration
+    :members: forward
+
+
+TFBlenderbotSmallModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBlenderbotSmallModel
+    :members: call
+
+
+TFBlenderbotSmallForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFBlenderbotSmallForConditionalGeneration
+    :members: call
--- a/docs/source/model_doc/camembert.rst
+++ b/docs/source/model_doc/camembert.rst
@@ -1,29 +1,41 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 CamemBERT
 -----------------------------------------------------------------------------------------------------------------------

 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`__
-by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
+The CamemBERT model was proposed in `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`__ by
+Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suárez, Yoann Dupont, Laurent Romary, Éric Villemonte de la
 Clergerie, Djamé Seddah, and Benoît Sagot. It is based on Facebook's RoBERTa model released in 2019. It is a model
 trained on 138GB of French text.

 The abstract from the paper is the following:

-*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success,
-most available models have either been trained on English data or on the concatenation of data in multiple
-languages. This makes practical use of such models --in all languages except English-- very limited. Aiming
-to address this issue for French, we release CamemBERT, a French version of the Bi-directional Encoders for
-Transformers (BERT). We measure the performance of CamemBERT compared to multilingual models in multiple
-downstream tasks, namely part-of-speech tagging, dependency parsing, named-entity recognition, and natural
-language inference. CamemBERT improves the state of the art for most of the tasks considered. We release the
-pretrained model for CamemBERT hoping to foster research and downstream applications for French NLP.*
+*Pretrained language models are now ubiquitous in Natural Language Processing. Despite their success, most available
+models have either been trained on English data or on the concatenation of data in multiple languages. This makes
+practical use of such models --in all languages except English-- very limited. Aiming to address this issue for French,
+we release CamemBERT, a French version of the Bi-directional Encoders for Transformers (BERT). We measure the
+performance of CamemBERT compared to multilingual models in multiple downstream tasks, namely part-of-speech tagging,
+dependency parsing, named-entity recognition, and natural language inference. CamemBERT improves the state of the art
+for most of the tasks considered. We release the pretrained model for CamemBERT hoping to foster research and
+downstream applications for French NLP.*

 Tips:

- This implementation is the same as RoBERTa. Refer to the :doc:`documentation of RoBERTa <roberta>` for usage
-  examples as well as the information relative to the inputs and outputs.
+- This implementation is the same as RoBERTa. Refer to the :doc:`documentation of RoBERTa <roberta>` for usage examples
+  as well as the information relative to the inputs and outputs.

 The original code can be found `here <https://camembert-model.fr/>`__.

@@ -42,6 +54,13 @@ CamembertTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+CamembertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CamembertTokenizerFast
+    :members:
+
+
 CamembertModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -130,4 +149,4 @@ TFCamembertForQuestionAnswering
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.TFCamembertForQuestionAnswering
-    :members:
+    :members:
--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 CTRL
 -----------------------------------------------------------------------------------------------------------------------

@@ -6,33 +18,33 @@ Overview

 CTRL model was proposed in `CTRL: A Conditional Transformer Language Model for Controllable Generation
 <https://arxiv.org/abs/1909.05858>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and
-Richard Socher. It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
-corpus of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).
+Richard Socher. It's a causal (unidirectional) transformer pre-trained using language modeling on a very large corpus
+of ~140 GB of text data with the first token reserved as a control code (such as Links, Books, Wikipedia etc.).

 The abstract from the paper is the following:

 *Large-scale language models show promising text generation capabilities, but users cannot easily control particular
 aspects of the generated text. We release CTRL, a 1.63 billion-parameter conditional transformer language model,
 trained to condition on control codes that govern style, content, and task-specific behavior. Control codes were
-derived from structure that naturally co-occurs with raw text, preserving the advantages of unsupervised learning
-while providing more explicit control over text generation. These codes also allow CTRL to predict which parts of
-the training data are most likely given a sequence. This provides a potential method for analyzing large amounts
-of data via model-based source attribution.*
+derived from structure that naturally co-occurs with raw text, preserving the advantages of unsupervised learning while
+providing more explicit control over text generation. These codes also allow CTRL to predict which parts of the
+training data are most likely given a sequence. This provides a potential method for analyzing large amounts of data
+via model-based source attribution.*

 Tips:

 - CTRL makes use of control codes to generate text: it requires generations to be started by certain words, sentences
-  or links to generate coherent text. Refer to the `original implementation <https://github.com/salesforce/ctrl>`__
-  for more information.
- CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on
-  the right rather than the left.
+  or links to generate coherent text. Refer to the `original implementation <https://github.com/salesforce/ctrl>`__ for
+  more information.
+- CTRL is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
+  the left.
 - CTRL was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
-  token in a sequence. Leveraging this feature allows CTRL to generate syntactically coherent text as
-  it can be observed in the `run_generation.py` example script.
+  token in a sequence. Leveraging this feature allows CTRL to generate syntactically coherent text, as can be
+  observed in the `run_generation.py` example script.
 - The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
-  this `past` value prevents the model from re-computing pre-computed values in the context of text generation.
-  See `reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage
-  of this argument.
+  this `past` value prevents the model from re-computing pre-computed values in the context of text generation. See
+  `reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage of
+  this argument.

 The original code can be found `here <https://github.com/salesforce/ctrl>`__.

@@ -65,6 +77,13 @@ CTRLLMHeadModel
    :members: forward


+CTRLForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.CTRLForSequenceClassification
+    :members: forward
+
+
 TFCTRLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -78,3 +97,8 @@ TFCTRLLMHeadModel
 .. autoclass:: transformers.TFCTRLLMHeadModel
    :members: call

+TFCTRLForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFCTRLForSequenceClassification
+    :members: call
--- a/docs/source/model_doc/deberta.rst
+++ b/docs/source/model_doc/deberta.rst
@@ -0,0 +1,77 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+DeBERTa
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The DeBERTa model was proposed in `DeBERTa: Decoding-enhanced BERT with Disentangled Attention
+<https://arxiv.org/abs/2006.03654>`__ by Pengcheng He, Xiaodong Liu, Jianfeng Gao, Weizhu Chen. It is based on Google's
+BERT model released in 2018 and Facebook's RoBERTa model released in 2019.
+
+It builds on RoBERTa with disentangled attention and enhanced mask decoder training with half of the data used in
+RoBERTa.
+
+The abstract from the paper is the following:
+
+*Recent progress in pre-trained neural language models has significantly improved the performance of many natural
+language processing (NLP) tasks. In this paper we propose a new model architecture DeBERTa (Decoding-enhanced BERT with
+disentangled attention) that improves the BERT and RoBERTa models using two novel techniques. The first is the
+disentangled attention mechanism, where each word is represented using two vectors that encode its content and
+position, respectively, and the attention weights among words are computed using disentangled matrices on their
+contents and relative positions. Second, an enhanced mask decoder is used to replace the output softmax layer to
+predict the masked tokens for model pretraining. We show that these two techniques significantly improve the efficiency
+of model pretraining and performance of downstream tasks. Compared to RoBERTa-Large, a DeBERTa model trained on half of
+the training data performs consistently better on a wide range of NLP tasks, achieving improvements on MNLI by +0.9%
+(90.2% vs. 91.1%), on SQuAD v2.0 by +2.3% (88.4% vs. 90.7%) and RACE by +3.6% (83.2% vs. 86.8%). The DeBERTa code and
+pre-trained models will be made publicly available at https://github.com/microsoft/DeBERTa.*
+
+
+The original code can be found `here <https://github.com/microsoft/DeBERTa>`__.
+
+
+DebertaConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaConfig
+    :members:
+
+
+DebertaTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+DebertaModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaModel
+    :members:
+
+
+DebertaPreTrainedModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaPreTrainedModel
+    :members:
+
+
+DebertaForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.DebertaForSequenceClassification
+    :members:
--- a/docs/source/model_doc/dialogpt.rst
+++ b/docs/source/model_doc/dialogpt.rst
@@ -1,39 +1,54 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 DialoGPT
 -----------------------------------------------------------------------------------------------------------------------

 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-DialoGPT was proposed in
-`DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation <https://arxiv.org/abs/1911.00536>`_
-by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao, Jianfeng Gao, Jingjing Liu, Bill Dolan.
-It's a GPT2 Model trained on 147M conversation-like exchanges extracted from Reddit.
+DialoGPT was proposed in `DialoGPT: Large-Scale Generative Pre-training for Conversational Response Generation
+<https://arxiv.org/abs/1911.00536>`_ by Yizhe Zhang, Siqi Sun, Michel Galley, Yen-Chun Chen, Chris Brockett, Xiang Gao,
+Jianfeng Gao, Jingjing Liu, Bill Dolan. It's a GPT2 Model trained on 147M conversation-like exchanges extracted from
+Reddit.

 The abstract from the paper is the following:

-*We present a large, tunable neural conversational response generation model, DialoGPT (dialogue generative pre-trained transformer). 
-Trained on 147M conversation-like exchanges extracted from Reddit comment chains over a period spanning from 2005 through 2017, DialoGPT extends the Hugging Face PyTorch transformer to attain a performance close to human both in terms of automatic and human evaluation in single-turn dialogue settings.
-We show that conversational systems that leverage DialoGPT generate more relevant, contentful and context-consistent responses than strong baseline systems.
-The pre-trained model and training pipeline are publicly released to facilitate research into neural response generation and the development of more intelligent open-domain dialogue systems.*
+*We present a large, tunable neural conversational response generation model, DialoGPT (dialogue generative pre-trained
+transformer). Trained on 147M conversation-like exchanges extracted from Reddit comment chains over a period spanning
+from 2005 through 2017, DialoGPT extends the Hugging Face PyTorch transformer to attain a performance close to human
+both in terms of automatic and human evaluation in single-turn dialogue settings. We show that conversational systems
+that leverage DialoGPT generate more relevant, contentful and context-consistent responses than strong baseline
+systems. The pre-trained model and training pipeline are publicly released to facilitate research into neural response
+generation and the development of more intelligent open-domain dialogue systems.*

 Tips:

- DialoGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
-  the right rather than the left.
- DialoGPT was trained with a causal language modeling (CLM) objective on conversational data and is therefore powerful at response generation in open-domain dialogue systems.
- DialoGPT enables the user to create a chat bot in just 10 lines of code as shown on `DialoGPT's model card <https://huggingface.co/microsoft/DialoGPT-medium>`_.
+- DialoGPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather
+  than the left.
+- DialoGPT was trained with a causal language modeling (CLM) objective on conversational data and is therefore powerful
+  at response generation in open-domain dialogue systems.
+- DialoGPT enables the user to create a chat bot in just 10 lines of code as shown on `DialoGPT's model card
+  <https://huggingface.co/microsoft/DialoGPT-medium>`_.

 Training:

-In order to train or fine-tune DialoGPT, one can use causal language modeling training. 
-To cite the official paper: 
-*We follow the OpenAI GPT-2 to model a multiturn dialogue session 
-as a long text and frame the generation task as language modeling. We first
-concatenate all dialog turns within a dialogue session into a long text 
-x_1,..., x_N (N is the sequence length), ended by the end-of-text token.* 
-For more information please confer to the original paper.
-    
+In order to train or fine-tune DialoGPT, one can use causal language modeling training. To cite the official paper: *We
+follow the OpenAI GPT-2 to model a multiturn dialogue session as a long text and frame the generation task as language
+modeling. We first concatenate all dialog turns within a dialogue session into a long text x_1,..., x_N (N is the
+sequence length), ended by the end-of-text token.* For more information please refer to the original paper.

-DialoGPT's architecture is based on the GPT2 model, so one can refer to GPT2's `docstring <https://huggingface.co/transformers/model_doc/gpt2.html>`_.
+
+DialoGPT's architecture is based on the GPT2 model, so one can refer to GPT2's `docstring
+<https://huggingface.co/transformers/model_doc/gpt2.html>`_.

 The original code can be found `here <https://github.com/microsoft/DialoGPT>`_.
--- a/docs/source/model_doc/distilbert.rst
+++ b/docs/source/model_doc/distilbert.rst
@@ -1,16 +1,27 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 DistilBERT
 -----------------------------------------------------------------------------------------------------------------------

 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The DistilBERT model was proposed in the blog post
-`Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT
-<https://medium.com/huggingface/distilbert-8cf3380435b5>`__, and the paper `DistilBERT, a distilled version of BERT:
-smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__.
-DistilBERT is a small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% less
-parameters than `bert-base-uncased`, runs 60% faster while preserving over 95% of BERT's performances as measured on
-the GLUE language understanding benchmark.
+The DistilBERT model was proposed in the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a
+distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`__, and the paper `DistilBERT, a
+distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`__. DistilBERT is a
+small, fast, cheap and light Transformer model trained by distilling BERT base. It has 40% fewer parameters than
+`bert-base-uncased`, runs 60% faster while preserving over 95% of BERT's performance as measured on the GLUE language
+understanding benchmark.

 The abstract from the paper is the following:

@@ -18,13 +29,13 @@ The abstract from the paper is the following:
 operating these large models in on-the-edge and/or under constrained computational training or inference budgets
 remains challenging. In this work, we propose a method to pre-train a smaller general-purpose language representation
 model, called DistilBERT, which can then be fine-tuned with good performances on a wide range of tasks like its larger
-counterparts. While most prior work investigated the use of distillation for building task-specific models, we
-leverage knowledge distillation during the pre-training phase and show that it is possible to reduce the size of a
-BERT model by 40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage
-the inductive biases learned by larger models during pre-training, we introduce a triple loss combining language
-modeling, distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train
-and we demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative
-on-device study.*
+counterparts. While most prior work investigated the use of distillation for building task-specific models, we leverage
+knowledge distillation during the pretraining phase and show that it is possible to reduce the size of a BERT model by
+40%, while retaining 97% of its language understanding capabilities and being 60% faster. To leverage the inductive
+biases learned by larger models during pretraining, we introduce a triple loss combining language modeling,
+distillation and cosine-distance losses. Our smaller, faster and lighter model is cheaper to pre-train and we
+demonstrate its capabilities for on-device computations in a proof-of-concept experiment and a comparative on-device
+study.*

 Tips:

@@ -33,7 +44,8 @@ Tips:
 - DistilBERT doesn't have options to select the input positions (:obj:`position_ids` input). This could be added if
  necessary though, just let us know if you need this option.

-The original code can be found `here <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__.
+The original code can be found `here
+<https://github.com/huggingface/transformers/tree/master/examples/distillation>`__.


 DistilBertConfig
--- a/docs/source/model_doc/dpr.rst
+++ b/docs/source/model_doc/dpr.rst
@@ -1,12 +1,24 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 DPR
 -----------------------------------------------------------------------------------------------------------------------

 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Dense Passage Retrieval (DPR) is a set of tools and models for state-of-the-art open-domain Q&A research.
-It was intorduced in `Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`__
-by Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih.
+Dense Passage Retrieval (DPR) is a set of tools and models for state-of-the-art open-domain Q&A research. It was
+introduced in `Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`__ by
+Vladimir Karpukhin, Barlas Oğuz, Sewon Min, Patrick Lewis, Ledell Wu, Sergey Edunov, Danqi Chen, Wen-tau Yih.

 The abstract from the paper is the following:

@@ -71,13 +83,13 @@ DPRReaderTokenizerFast
 DPR specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_dpr.DPRContextEncoderOutput
+.. autoclass:: transformers.models.dpr.modeling_dpr.DPRContextEncoderOutput
    :members:

-.. autoclass:: transformers.modeling_dpr.DPRQuestionEncoderOutput
+.. autoclass:: transformers.models.dpr.modeling_dpr.DPRQuestionEncoderOutput
    :members:

-.. autoclass:: transformers.modeling_dpr.DPRReaderOutput
+.. autoclass:: transformers.models.dpr.modeling_dpr.DPRReaderOutput
    :members:


@@ -99,3 +111,22 @@ DPRReader

 .. autoclass:: transformers.DPRReader
    :members: forward
+
+TFDPRContextEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDPRContextEncoder
+    :members: call
+
+TFDPRQuestionEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDPRQuestionEncoder
+    :members: call
+
+
+TFDPRReader
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFDPRReader
+    :members: call
--- a/docs/source/model_doc/electra.rst
+++ b/docs/source/model_doc/electra.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 ELECTRA
 -----------------------------------------------------------------------------------------------------------------------

@@ -12,34 +24,28 @@ identify which tokens were replaced by the generator in the sequence.

 The abstract from the paper is the following:

-*Masked language modeling (MLM) pre-training methods such as BERT corrupt
-the input by replacing some tokens with [MASK] and then train a model to
-reconstruct the original tokens. While they produce good results when transferred
-to downstream NLP tasks, they generally require large amounts of compute to be
-effective. As an alternative, we propose a more sample-efficient pre-training task
-called replaced token detection. Instead of masking the input, our approach
-corrupts it by replacing some tokens with plausible alternatives sampled from a small
-generator network. Then, instead of training a model that predicts the original
-identities of the corrupted tokens, we train a discriminative model that predicts
-whether each token in the corrupted input was replaced by a generator sample
-or not. Thorough experiments demonstrate this new pre-training task is more
-efficient than MLM because the task is defined over all input tokens rather than
-just the small subset that was masked out. As a result, the contextual representations
-learned by our approach substantially outperform the ones learned by BERT
-given the same model size, data, and compute. The gains are particularly strong
-for small models; for example, we train a model on one GPU for 4 days that
-outperforms GPT (trained using 30x more compute) on the GLUE natural language
-understanding benchmark. Our approach also works well at scale, where it
-performs comparably to RoBERTa and XLNet while using less than 1/4 of their
-compute and outperforms them when using the same amount of compute.*
+*Masked language modeling (MLM) pretraining methods such as BERT corrupt the input by replacing some tokens with [MASK]
+and then train a model to reconstruct the original tokens. While they produce good results when transferred to
+downstream NLP tasks, they generally require large amounts of compute to be effective. As an alternative, we propose a
+more sample-efficient pretraining task called replaced token detection. Instead of masking the input, our approach
+corrupts it by replacing some tokens with plausible alternatives sampled from a small generator network. Then, instead
+of training a model that predicts the original identities of the corrupted tokens, we train a discriminative model that
+predicts whether each token in the corrupted input was replaced by a generator sample or not. Thorough experiments
+demonstrate this new pretraining task is more efficient than MLM because the task is defined over all input tokens
+rather than just the small subset that was masked out. As a result, the contextual representations learned by our
+approach substantially outperform the ones learned by BERT given the same model size, data, and compute. The gains are
+particularly strong for small models; for example, we train a model on one GPU for 4 days that outperforms GPT (trained
+using 30x more compute) on the GLUE natural language understanding benchmark. Our approach also works well at scale,
+where it performs comparably to RoBERTa and XLNet while using less than 1/4 of their compute and outperforms them when
+using the same amount of compute.*

 Tips:

 - ELECTRA is the pretraining approach, therefore there is nearly no changes done to the underlying model: BERT. The
  only change is the separation of the embedding size and the hidden size: the embedding size is generally smaller,
-  while the hidden size is larger. An additional projection layer (linear) is used to project the embeddings from
-  their embedding size to the hidden size. In the case where the embedding size is the same as the hidden size, no
-  projection layer is used.
+  while the hidden size is larger. An additional projection layer (linear) is used to project the embeddings from their
+  embedding size to the hidden size. In the case where the embedding size is the same as the hidden size, no projection
+  layer is used.
 - The ELECTRA checkpoints saved using `Google Research's implementation <https://github.com/google-research/electra>`__
  contain both the generator and discriminator. The conversion script requires the user to name which model to export
  into the correct architecture. Once converted to the HuggingFace format, these checkpoints may be loaded into all
@@ -75,10 +81,10 @@ ElectraTokenizerFast
 Electra specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_electra.ElectraForPreTrainingOutput
+.. autoclass:: transformers.models.electra.modeling_electra.ElectraForPreTrainingOutput
    :members:

-.. autoclass:: transformers.modeling_tf_electra.TFElectraForPreTrainingOutput
+.. autoclass:: transformers.models.electra.modeling_tf_electra.TFElectraForPreTrainingOutput
    :members:


--- a/docs/source/model_doc/encoderdecoder.rst
+++ b/docs/source/model_doc/encoderdecoder.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Encoder Decoder Models
 -----------------------------------------------------------------------------------------------------------------------

@@ -13,7 +25,7 @@ any other models (see the examples for more information).

 An application of this architecture could be to leverage two pretrained :class:`~transformers.BertModel` as the encoder
 and decoder for a summarization model as was shown in: `Text Summarization with Pretrained Encoders
-<https://arxiv.org/abs/1908.08345>`__ by Yang Liu and Mirella Lapata. 
+<https://arxiv.org/abs/1908.08345>`__ by Yang Liu and Mirella Lapata.


 EncoderDecoderConfig
@@ -27,4 +39,4 @@ EncoderDecoderModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.EncoderDecoderModel
-    :members: forward
+    :members: forward, from_encoder_decoder_pretrained
--- a/docs/source/model_doc/flaubert.rst
+++ b/docs/source/model_doc/flaubert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 FlauBERT
 -----------------------------------------------------------------------------------------------------------------------

@@ -11,17 +23,17 @@ modeling (MLM) objective (like BERT).
 The abstract from the paper is the following:

 *Language models have become a key step to achieve state-of-the art results in many different Natural Language
-Processing (NLP) tasks. Leveraging the huge amount of unlabeled texts nowadays available, they provide an efficient
-way to pre-train continuous word representations that can be fine-tuned for a downstream task, along with their
+Processing (NLP) tasks. Leveraging the huge amount of unlabeled texts nowadays available, they provide an efficient way
+to pre-train continuous word representations that can be fine-tuned for a downstream task, along with their
 contextualization at the sentence level. This has been widely demonstrated for English using contextualized
-representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018; Radford et al., 2018; Devlin et
-al., 2019; Yang et al., 2019b). In this paper, we introduce and share FlauBERT, a model learned on a very large
-and heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre
-for Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text
-classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most
-of the time they outperform other pre-training approaches. Different versions of FlauBERT as well as a unified
-evaluation protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared
-to the research community for further reproducible experiments in French NLP.*
+representations (Dai and Le, 2015; Peters et al., 2018; Howard and Ruder, 2018; Radford et al., 2018; Devlin et al.,
+2019; Yang et al., 2019b). In this paper, we introduce and share FlauBERT, a model learned on a very large and
+heterogeneous French corpus. Models of different sizes are trained using the new CNRS (French National Centre for
+Scientific Research) Jean Zay supercomputer. We apply our French language models to diverse NLP tasks (text
+classification, paraphrasing, natural language inference, parsing, word sense disambiguation) and show that most of the
+time they outperform other pretraining approaches. Different versions of FlauBERT as well as a unified evaluation
+protocol for the downstream tasks, called FLUE (French Language Understanding Evaluation), are shared to the research
+community for further reproducible experiments in French NLP.*

 The original code can be found `here <https://github.com/getalp/Flaubert>`__.

--- a/docs/source/model_doc/fsmt.rst
+++ b/docs/source/model_doc/fsmt.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 FSMT
 -----------------------------------------------------------------------------------------------------------------------

@@ -58,4 +70,4 @@ FSMTForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.FSMTForConditionalGeneration
-    :members: forward
+    :members: forward
--- a/docs/source/model_doc/funnel.rst
+++ b/docs/source/model_doc/funnel.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Funnel Transformer
 -----------------------------------------------------------------------------------------------------------------------

@@ -30,8 +42,8 @@ Tips:
  directly for tasks that just require a sentence summary (like sequence classification or multiple choice). For other
  tasks, the full model is used; this full model has a decoder that upsamples the final hidden states to the same
  sequence length as the input.
- The Funnel Transformer checkpoints are all available with a full version and a base version. The first ones should
-  be used for :class:`~transformers.FunnelModel`, :class:`~transformers.FunnelForPreTraining`,
+- The Funnel Transformer checkpoints are all available with a full version and a base version. The first ones should be
+  used for :class:`~transformers.FunnelModel`, :class:`~transformers.FunnelForPreTraining`,
  :class:`~transformers.FunnelForMaskedLM`, :class:`~transformers.FunnelForTokenClassification` and
  class:`~transformers.FunnelForQuestionAnswering`. The second ones should be used for
  :class:`~transformers.FunnelBaseModel`, :class:`~transformers.FunnelForSequenceClassification` and
@@ -65,10 +77,10 @@ FunnelTokenizerFast
 Funnel specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_funnel.FunnelForPreTrainingOutput
+.. autoclass:: transformers.models.funnel.modeling_funnel.FunnelForPreTrainingOutput
    :members:

-.. autoclass:: transformers.modeling_tf_funnel.TFFunnelForPreTrainingOutput
+.. autoclass:: transformers.models.funnel.modeling_tf_funnel.TFFunnelForPreTrainingOutput
    :members:


--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 OpenAI GPT
 -----------------------------------------------------------------------------------------------------------------------

@@ -6,51 +18,47 @@ Overview

 OpenAI GPT model was proposed in `Improving Language Understanding by Generative Pre-Training
 <https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf>`__
-by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. It's a causal (unidirectional)
-transformer pre-trained using language modeling on a large corpus will long range dependencies, the Toronto Book
-Corpus.
+by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever. It's a causal (unidirectional) transformer
+pre-trained using language modeling on a large corpus with long range dependencies, the Toronto Book Corpus.

 The abstract from the paper is the following:

-*Natural language understanding comprises a wide range of diverse tasks such
-as textual entailment, question answering, semantic similarity assessment, and
-document classification. Although large unlabeled text corpora are abundant,
-labeled data for learning these specific tasks is scarce, making it challenging for
-discriminatively trained models to perform adequately. We demonstrate that large
-gains on these tasks can be realized by generative pre-training of a language model
-on a diverse corpus of unlabeled text, followed by discriminative fine-tuning on each
-specific task. In contrast to previous approaches, we make use of task-aware input
-transformations during fine-tuning to achieve effective transfer while requiring
-minimal changes to the model architecture. We demonstrate the effectiveness of
-our approach on a wide range of benchmarks for natural language understanding.
-Our general task-agnostic model outperforms discriminatively trained models that
-use architectures specifically crafted for each task, significantly improving upon the
-state of the art in 9 out of the 12 tasks studied.*
+*Natural language understanding comprises a wide range of diverse tasks such as textual entailment, question answering,
+semantic similarity assessment, and document classification. Although large unlabeled text corpora are abundant,
+labeled data for learning these specific tasks is scarce, making it challenging for discriminatively trained models to
+perform adequately. We demonstrate that large gains on these tasks can be realized by generative pretraining of a
+language model on a diverse corpus of unlabeled text, followed by discriminative fine-tuning on each specific task. In
+contrast to previous approaches, we make use of task-aware input transformations during fine-tuning to achieve
+effective transfer while requiring minimal changes to the model architecture. We demonstrate the effectiveness of our
+approach on a wide range of benchmarks for natural language understanding. Our general task-agnostic model outperforms
+discriminatively trained models that use architectures specifically crafted for each task, significantly improving upon
+the state of the art in 9 out of the 12 tasks studied.*

 Tips:

- GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
-  the right rather than the left.
+- GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
+  the left.
 - GPT was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
-  token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as
-  it can be observed in the `run_generation.py` example script.
+  token in a sequence. Leveraging this feature allows GPT to generate syntactically coherent text as it can be
+  observed in the `run_generation.py` example script.

-`Write With Transformer <https://transformer.huggingface.co/doc/gpt>`__ is a webapp created and hosted by
-Hugging Face showcasing the generative capabilities of several models. GPT is one of them.
+`Write With Transformer <https://transformer.huggingface.co/doc/gpt>`__ is a webapp created and hosted by Hugging Face
+showcasing the generative capabilities of several models. GPT is one of them.

 The original code can be found `here <https://github.com/openai/finetune-transformer-lm>`__.

 Note:

-If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install 
-``ftfy`` and ``SpaCy``::
+If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install ``ftfy``
+and ``SpaCy``:
+
+.. code-block:: bash

    pip install spacy ftfy==4.4.3
    python -m spacy download en

 If you don't install ``ftfy`` and ``SpaCy``, the :class:`~transformers.OpenAIGPTTokenizer` will default to tokenize
-using BERT's :obj:`BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't 
-worry).
+using BERT's :obj:`BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).

 OpenAIGPTConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -76,10 +84,10 @@ OpenAIGPTTokenizerFast
 OpenAI specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_openai.OpenAIGPTDoubleHeadsModelOutput
+.. autoclass:: transformers.models.openai.modeling_openai.OpenAIGPTDoubleHeadsModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_openai.TFOpenAIGPTDoubleHeadsModelOutput
+.. autoclass:: transformers.models.openai.modeling_tf_openai.TFOpenAIGPTDoubleHeadsModelOutput
    :members:


@@ -104,6 +112,13 @@ OpenAIGPTDoubleHeadsModel
    :members: forward


+OpenAIGPTForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.OpenAIGPTForSequenceClassification
+    :members: forward
+
+
 TFOpenAIGPTModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -123,3 +138,9 @@ TFOpenAIGPTDoubleHeadsModel

 .. autoclass:: transformers.TFOpenAIGPTDoubleHeadsModel
    :members: call
+
+TFOpenAIGPTForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFOpenAIGPTForSequenceClassification
+    :members: call
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 OpenAI GPT2
 -----------------------------------------------------------------------------------------------------------------------

@@ -5,29 +17,29 @@ Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 OpenAI GPT-2 model was proposed in `Language Models are Unsupervised Multitask Learners
-<https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_
-by Alec Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever. It's a causal (unidirectional)
-transformer pretrained using  language modeling on a very large corpus of ~40 GB of text data.
+<https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_ by Alec
+Radford, Jeffrey Wu, Rewon Child, David Luan, Dario Amodei and Ilya Sutskever. It's a causal (unidirectional)
+transformer pretrained using language modeling on a very large corpus of ~40 GB of text data.

 The abstract from the paper is the following:

-*GPT-2 is a large transformer-based language model with 1.5 billion parameters, trained on a dataset[1]
-of 8 million web pages. GPT-2 is trained with a simple objective: predict the next word, given all of the previous
-words within some text. The diversity of the dataset causes this simple goal to contain naturally occurring
-demonstrations of many tasks across diverse domains. GPT-2 is a direct scale-up of GPT, with more than 10X
-the parameters and trained on more than 10X the amount of data.*
+*GPT-2 is a large transformer-based language model with 1.5 billion parameters, trained on a dataset[1] of 8 million
+web pages. GPT-2 is trained with a simple objective: predict the next word, given all of the previous words within some
+text. The diversity of the dataset causes this simple goal to contain naturally occurring demonstrations of many tasks
+across diverse domains. GPT-2 is a direct scale-up of GPT, with more than 10X the parameters and trained on more than
+10X the amount of data.*

 Tips:

- GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
-  the right rather than the left.
+- GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather than
+  the left.
 - GPT-2 was trained with a causal language modeling (CLM) objective and is therefore powerful at predicting the next
-  token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as
-  it can be observed in the `run_generation.py` example script.
+  token in a sequence. Leveraging this feature allows GPT-2 to generate syntactically coherent text as it can be
+  observed in the `run_generation.py` example script.
 - The PyTorch models can take the `past` as input, which is the previously computed key/value attention pairs. Using
-  this `past` value prevents the model from re-computing pre-computed values in the context of text generation.
-  See `reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage
-  of this argument.
+  this `past` value prevents the model from re-computing pre-computed values in the context of text generation. See
+  `reusing the past in generative models <../quickstart.html#using-the-past>`__ for more information on the usage of
+  this argument.

 `Write With Transformer <https://transformer.huggingface.co/doc/gpt2-large>`__ is a webapp created and hosted by
 Hugging Face showcasing the generative capabilities of several models. GPT-2 is one of them and is available in five
@@ -60,10 +72,10 @@ GPT2TokenizerFast
 GPT2 specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_gpt2.GPT2DoubleHeadsModelOutput
+.. autoclass:: transformers.models.gpt2.modeling_gpt2.GPT2DoubleHeadsModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_gpt2.TFGPT2DoubleHeadsModelOutput
+.. autoclass:: transformers.models.gpt2.modeling_tf_gpt2.TFGPT2DoubleHeadsModelOutput
    :members:


@@ -71,14 +83,14 @@ GPT2Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.GPT2Model
-    :members: forward
+    :members: forward, parallelize, deparallelize


 GPT2LMHeadModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.GPT2LMHeadModel
-    :members: forward
+    :members: forward, parallelize, deparallelize


 GPT2DoubleHeadsModel
@@ -88,6 +100,13 @@ GPT2DoubleHeadsModel
    :members: forward


+GPT2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.GPT2ForSequenceClassification
+    :members: forward
+
+
 TFGPT2Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -107,3 +126,15 @@ TFGPT2DoubleHeadsModel

 .. autoclass:: transformers.TFGPT2DoubleHeadsModel
    :members: call
+
+TFGPT2ForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFGPT2ForSequenceClassification
+    :members: call
+
+TFSequenceClassifierOutputWithPast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.modeling_tf_outputs.TFSequenceClassifierOutputWithPast
+    :members:
--- a/docs/source/model_doc/herbert.rst
+++ b/docs/source/model_doc/herbert.rst
@@ -0,0 +1,71 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+herBERT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The herBERT model was proposed in `KLEJ: Comprehensive Benchmark for Polish Language Understanding
+<https://www.aclweb.org/anthology/2020.acl-main.111.pdf>`__ by Piotr Rybak, Robert Mroczkowski, Janusz Tracz, and
+Ireneusz Gawlik. It is a BERT-based Language Model trained on Polish Corpora using only MLM objective with dynamic
+masking of whole words.
+
+The abstract from the paper is the following:
+
+*In recent years, a series of Transformer-based models unlocked major improvements in general natural language
+understanding (NLU) tasks. Such a fast pace of research would not be possible without general NLU benchmarks, which
+allow for a fair comparison of the proposed methods. However, such benchmarks are available only for a handful of
+languages. To alleviate this issue, we introduce a comprehensive multi-task benchmark for the Polish language
+understanding, accompanied by an online leaderboard. It consists of a diverse set of tasks, adopted from existing
+datasets for named entity recognition, question-answering, textual entailment, and others. We also introduce a new
+sentiment analysis task for the e-commerce domain, named Allegro Reviews (AR). To ensure a common evaluation scheme and
+promote models that generalize to different NLU tasks, the benchmark includes datasets from varying domains and
+applications. Additionally, we release HerBERT, a Transformer-based model trained specifically for the Polish language,
+which has the best average performance and obtains the best results for three out of nine tasks. Finally, we provide an
+extensive evaluation, including several standard baselines and recently proposed, multilingual Transformer-based
+models.*
+
+Examples of use:
+
+.. code-block::
+
+  from transformers import HerbertTokenizer, RobertaModel
+
+  tokenizer = HerbertTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
+  model = RobertaModel.from_pretrained("allegro/herbert-klej-cased-v1")
+
+  encoded_input = tokenizer.encode("Kto ma lepszą sztukę, ma lepszy rząd – to jasne.", return_tensors='pt')
+  outputs = model(encoded_input)
+
+  # HerBERT can also be loaded using AutoTokenizer and AutoModel:
+  import torch
+  from transformers import AutoModel, AutoTokenizer
+
+  tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-klej-cased-tokenizer-v1")
+  model = AutoModel.from_pretrained("allegro/herbert-klej-cased-v1")
+
+
+The original code can be found `here <https://github.com/allegro/HerBERT>`__.
+
+HerbertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.HerbertTokenizer
+    :members: 
+
+HerbertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.HerbertTokenizerFast
+    :members: 
--- a/docs/source/model_doc/layoutlm.rst
+++ b/docs/source/model_doc/layoutlm.rst
@@ -1,55 +1,132 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 LayoutLM
----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------
+
+.. _Overview:

 Overview
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The LayoutLM model was proposed in `LayoutLM: Pre-training of Text and Layout for Document Image Understanding <https://arxiv.org/abs/1912.13318>`__
-by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and Ming Zhou. It's a simple but effective pre-training method 
-of text and layout for document image understanding and information extraction tasks, such as form understanding and receipt understanding.
+The LayoutLM model was proposed in the paper `LayoutLM: Pre-training of Text and Layout for Document Image
+Understanding <https://arxiv.org/abs/1912.13318>`__ by Yiheng Xu, Minghao Li, Lei Cui, Shaohan Huang, Furu Wei, and
+Ming Zhou. It's a simple but effective pretraining method of text and layout for document image understanding and
+information extraction tasks, such as form understanding and receipt understanding. It obtains state-of-the-art results
+on several downstream tasks:
+
+- form understanding: the `FUNSD <https://guillaumejaume.github.io/FUNSD/>`__ dataset (a collection of 199 annotated
+  forms comprising more than 30,000 words).
+- receipt understanding: the `SROIE <https://rrc.cvc.uab.es/?ch=13>`__ dataset (a collection of 626 receipts for
+  training and 347 receipts for testing).
+- document image classification: the `RVL-CDIP <https://www.cs.cmu.edu/~aharley/rvl-cdip/>`__ dataset (a collection of
+  400,000 images belonging to one of 16 classes).

 The abstract from the paper is the following:

-*Pre-training techniques have been verified successfully in a variety of NLP tasks in recent years. Despite the widespread use of pre-training models for NLP applications, they almost exclusively focus on text-level manipulation, while neglecting layout and style information that is vital for document image understanding. In this paper, we propose the \textbf{LayoutLM} to jointly model interactions between text and layout information across scanned document images, which is beneficial for a great number of real-world document image understanding tasks such as information extraction from scanned documents. Furthermore, we also leverage image features to incorporate words' visual information into LayoutLM. To the best of our knowledge, this is the first time that text and layout are jointly learned in a single framework for document-level pre-training. It achieves new state-of-the-art results in several downstream tasks, including form understanding (from 70.72 to 79.27), receipt understanding (from 94.02 to 95.24) and document image classification (from 93.07 to 94.42).*
+*Pre-training techniques have been verified successfully in a variety of NLP tasks in recent years. Despite the
+widespread use of pretraining models for NLP applications, they almost exclusively focus on text-level manipulation,
+while neglecting layout and style information that is vital for document image understanding. In this paper, we propose
+the LayoutLM to jointly model interactions between text and layout information across scanned document images, which is
+beneficial for a great number of real-world document image understanding tasks such as information extraction from
+scanned documents. Furthermore, we also leverage image features to incorporate words' visual information into LayoutLM.
+To the best of our knowledge, this is the first time that text and layout are jointly learned in a single framework for
+document-level pretraining. It achieves new state-of-the-art results in several downstream tasks, including form
+understanding (from 70.72 to 79.27), receipt understanding (from 94.02 to 95.24) and document image classification
+(from 93.07 to 94.42).*

 Tips:

- LayoutLM has an extra input called :obj:`bbox`, which is the bounding boxes of the input tokens.
- The :obj:`bbox` requires the data that on 0-1000 scale, which means you should normalize the bounding box before passing them into model.
+- In addition to :obj:`input_ids`, :meth:`~transformers.LayoutLMModel.forward` also expects the input :obj:`bbox`,
+  which are the bounding boxes (i.e. 2D-positions) of the input tokens. These can be obtained using an external OCR
+  engine such as Google's `Tesseract <https://github.com/tesseract-ocr/tesseract>`__ (there's a `Python wrapper
+  <https://pypi.org/project/pytesseract/>`__ available). Each bounding box should be in (x0, y0, x1, y1) format, where
+  (x0, y0) corresponds to the position of the upper left corner in the bounding box, and (x1, y1) represents the
+  position of the lower right corner. Note that one first needs to normalize the bounding boxes to be on a 0-1000
+  scale. To normalize, you can use the following function:
+
+.. code-block::
+
+   def normalize_bbox(bbox, width, height):
+        return [
+            int(1000 * (bbox[0] / width)),
+            int(1000 * (bbox[1] / height)),
+            int(1000 * (bbox[2] / width)),
+            int(1000 * (bbox[3] / height)),
+        ]
+
+Here, :obj:`width` and :obj:`height` correspond to the width and height of the original document in which the token
+occurs. Those can be obtained using the Python Imaging Library (PIL), for example, as follows:
+
+.. code-block::
+
+   from PIL import Image
+
+   image = Image.open("name_of_your_document - can be a png file, pdf, etc.")
+
+   width, height = image.size
+
+- For a demo which shows how to fine-tune :class:`~transformers.LayoutLMForTokenClassification` on the `FUNSD dataset
+  <https://guillaumejaume.github.io/FUNSD/>`__ (a collection of annotated forms), see `this notebook
+  <https://github.com/NielsRogge/Transformers-Tutorials/blob/master/LayoutLM/Fine_tuning_LayoutLMForTokenClassification_on_FUNSD.ipynb>`__.
+  It includes an inference part, which shows how to use Google's Tesseract on a new document.

 The original code can be found `here <https://github.com/microsoft/unilm/tree/master/layoutlm>`_.


 LayoutLMConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.LayoutLMConfig
    :members:


 LayoutLMTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.LayoutLMTokenizer
    :members:


+LayoutLMTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMTokenizerFast
+    :members:
+
+
 LayoutLMModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.LayoutLMModel
    :members:


 LayoutLMForMaskedLM
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.LayoutLMForMaskedLM
    :members:


+LayoutLMForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LayoutLMForSequenceClassification
+    :members:
+
+
 LayoutLMForTokenClassification
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.LayoutLMForTokenClassification
    :members:
--- a/docs/source/model_doc/led.rst
+++ b/docs/source/model_doc/led.rst
@@ -0,0 +1,149 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+LED
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The LED model was proposed in `Longformer: The Long-Document Transformer <https://arxiv.org/abs/2004.05150>`__ by Iz
+Beltagy, Matthew E. Peters, Arman Cohan.
+
+The abstract from the paper is the following:
+
+*Transformer-based models are unable to process long sequences due to their self-attention operation, which scales
+quadratically with the sequence length. To address this limitation, we introduce the Longformer with an attention
+mechanism that scales linearly with sequence length, making it easy to process documents of thousands of tokens or
+longer. Longformer's attention mechanism is a drop-in replacement for the standard self-attention and combines a local
+windowed attention with a task motivated global attention. Following prior work on long-sequence transformers, we
+evaluate Longformer on character-level language modeling and achieve state-of-the-art results on text8 and enwik8. In
+contrast to most prior work, we also pretrain Longformer and finetune it on a variety of downstream tasks. Our
+pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on
+WikiHop and TriviaQA. We finally introduce the Longformer-Encoder-Decoder (LED), a Longformer variant for supporting
+long document generative sequence-to-sequence tasks, and demonstrate its effectiveness on the arXiv summarization
+dataset.*
+
+Tips:
+
+- :class:`~transformers.LEDForConditionalGeneration` is an extension of
+  :class:`~transformers.BartForConditionalGeneration` exchanging the traditional *self-attention* layer with
+  *Longformer*'s *chunked self-attention* layer. :class:`~transformers.LEDTokenizer` is an alias of
+  :class:`~transformers.BartTokenizer`.
+- LED works very well on long-range *sequence-to-sequence* tasks where the ``input_ids`` largely exceed a length of
+  1024 tokens.
+- LED pads the ``input_ids`` to be a multiple of ``config.attention_window`` if required. Therefore a small speed-up is
+  gained when :class:`~transformers.LEDTokenizer` is used with the ``pad_to_multiple_of`` argument.
+- LED makes use of *global attention* by means of the ``global_attention_mask`` (see
+  :class:`~transformers.LongformerModel`). For summarization, it is advised to put *global attention* only on the first
+  ``<s>`` token. For question answering, it is advised to put *global attention* on all tokens of the question.
+- To fine-tune LED on all 16384 tokens, it is necessary to enable *gradient checkpointing* by setting
+  ``config.gradient_checkpointing = True``.
+- A notebook showing how to evaluate LED can be accessed `here
+  <https://colab.research.google.com/drive/12INTTR6n64TzS4RrXZxMSXfrOd9Xzamo?usp=sharing>`__.
+- A notebook showing how to fine-tune LED can be accessed `here
+  <https://colab.research.google.com/drive/12LjJazBl7Gam0XBPy_y0CTOJZeZ34c2v?usp=sharing>`__.
+
+
+LEDConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDConfig
+    :members:
+
+
+LEDTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+LEDTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDTokenizerFast
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+LED specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.led.modeling_led.LEDEncoderBaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_led.LEDSeq2SeqModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_led.LEDSeq2SeqLMOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_led.LEDSeq2SeqSequenceClassifierOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_led.LEDSeq2SeqQuestionAnsweringModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_tf_led.TFLEDEncoderBaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_tf_led.TFLEDSeq2SeqModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.led.modeling_tf_led.TFLEDSeq2SeqLMOutput
+    :members: 
+
+
+
+
+LEDModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDModel
+    :members: forward
+
+
+LEDForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDForConditionalGeneration
+    :members: forward
+
+
+LEDForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDForSequenceClassification
+    :members: forward
+
+
+LEDForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.LEDForQuestionAnswering
+    :members: forward
+
+
+TFLEDModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLEDModel
+    :members: call
+
+
+TFLEDForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLEDForConditionalGeneration
+    :members: call
--- a/docs/source/model_doc/longformer.rst
+++ b/docs/source/model_doc/longformer.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Longformer
 -----------------------------------------------------------------------------------------------------------------------

@@ -22,25 +34,31 @@ contrast to most prior work, we also pretrain Longformer and finetune it on a va
 pretrained Longformer consistently outperforms RoBERTa on long document tasks and sets new state-of-the-art results on
 WikiHop and TriviaQA.*

+Tips:
+
+- Since the Longformer is based on RoBERTa, it doesn't have :obj:`token_type_ids`. You don't need to indicate which
+  token belongs to which segment. Just separate your segments with the separation token :obj:`tokenizer.sep_token` (or
+  :obj:`</s>`).
+
 The Authors' code can be found `here <https://github.com/allenai/longformer>`__.

 Longformer Self Attention
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Longformer self attention employs self attention on both a "local" context and a "global" context.
-Most tokens only attend "locally" to each other meaning that each token attends to its :math:`\frac{1}{2} w` previous
-tokens and :math:`\frac{1}{2} w` succeding tokens with :math:`w` being the window length as defined in
+Longformer self attention employs self attention on both a "local" context and a "global" context. Most tokens only
+attend "locally" to each other meaning that each token attends to its :math:`\frac{1}{2} w` previous tokens and
+:math:`\frac{1}{2} w` succeeding tokens with :math:`w` being the window length as defined in
 :obj:`config.attention_window`. Note that :obj:`config.attention_window` can be of type :obj:`List` to define a
 different :math:`w` for each layer. A selected few tokens attend "globally" to all other tokens, as it is
 conventionally done for all tokens in :obj:`BertSelfAttention`.

-Note that "locally" and "globally" attending tokens are projected by different query, key and value matrices.
-Also note that every "locally" attending token not only attends to tokens within its window :math:`w`, but also to all
-"globally" attending tokens so that global attention is *symmetric*.
+Note that "locally" and "globally" attending tokens are projected by different query, key and value matrices. Also note
+that every "locally" attending token not only attends to tokens within its window :math:`w`, but also to all "globally"
+attending tokens so that global attention is *symmetric*.

 The user can define which tokens attend "locally" and which tokens attend "globally" by setting the tensor
 :obj:`global_attention_mask` at run-time appropriately. All Longformer models employ the following logic for
-:obj:`global_attention_mask`: 
+:obj:`global_attention_mask`:

 - 0: the token attends "locally",
 - 1: the token attends "globally".
@@ -90,6 +108,50 @@ LongformerTokenizerFast
 .. autoclass:: transformers.LongformerTokenizerFast
    :members: 

+Longformer specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerBaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerBaseModelOutputWithPooling
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerMaskedLMOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerQuestionAnsweringModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerSequenceClassifierOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerMultipleChoiceModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_longformer.LongformerTokenClassifierOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerBaseModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerBaseModelOutputWithPooling
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerMaskedLMOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerQuestionAnsweringModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerSequenceClassifierOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerMultipleChoiceModelOutput
+    :members: 
+
+.. autoclass:: transformers.models.longformer.modeling_tf_longformer.TFLongformerTokenClassifierOutput
+    :members: 

 LongformerModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -153,3 +215,24 @@ TFLongformerForQuestionAnswering
 .. autoclass:: transformers.TFLongformerForQuestionAnswering
    :members: call

+
+TFLongformerForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLongformerForSequenceClassification
+    :members: call
+
+
+TFLongformerForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLongformerForTokenClassification
+    :members: call
+
+
+TFLongformerForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFLongformerForMultipleChoice
+    :members: call
+
--- a/docs/source/model_doc/lxmert.rst
+++ b/docs/source/model_doc/lxmert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 LXMERT
 -----------------------------------------------------------------------------------------------------------------------

@@ -8,9 +20,8 @@ The LXMERT model was proposed in `LXMERT: Learning Cross-Modality Encoder Repres
 <https://arxiv.org/abs/1908.07490>`__ by Hao Tan & Mohit Bansal. It is a series of bidirectional transformer encoders
 (one for the vision modality, one for the language modality, and then one to fuse both modalities) pretrained using a
 combination of masked language modeling, visual-language text alignment, ROI-feature regression, masked
-visual-attribute modeling, masked visual-object modeling, and visual-question answering objectives.
-The pretraining consists of multiple multi-modal datasets: MSCOCO, Visual-Genome + Visual-Genome Question Answering,
-VQA 2.0, and GQA.
+visual-attribute modeling, masked visual-object modeling, and visual-question answering objectives. The pretraining
+consists of multiple multi-modal datasets: MSCOCO, Visual-Genome + Visual-Genome Question Answering, VQA 2.0, and GQA.

 The abstract from the paper is the following:

@@ -20,7 +31,7 @@ Encoder Representations from Transformers) framework to learn these vision-and-l
 build a large-scale Transformer model that consists of three encoders: an object relationship encoder, a language
 encoder, and a cross-modality encoder. Next, to endow our model with the capability of connecting vision and language
 semantics, we pre-train the model with large amounts of image-and-sentence pairs, via five diverse representative
-pre-training tasks: masked language modeling, masked object prediction (feature regression and label classification),
+pretraining tasks: masked language modeling, masked object prediction (feature regression and label classification),
 cross-modality matching, and image question answering. These tasks help in learning both intra-modality and
 cross-modality relationships. After fine-tuning from our pretrained parameters, our model achieves the state-of-the-art
 results on two visual question answering datasets (i.e., VQA and GQA). We also show the generalizability of our
@@ -68,19 +79,19 @@ LxmertTokenizerFast
 Lxmert specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_lxmert.LxmertModelOutput
+.. autoclass:: transformers.models.lxmert.modeling_lxmert.LxmertModelOutput
    :members:

-.. autoclass:: transformers.modeling_lxmert.LxmertForPreTrainingOutput
+.. autoclass:: transformers.models.lxmert.modeling_lxmert.LxmertForPreTrainingOutput
    :members:

-.. autoclass:: transformers.modeling_lxmert.LxmertForQuestionAnsweringOutput
+.. autoclass:: transformers.models.lxmert.modeling_lxmert.LxmertForQuestionAnsweringOutput
    :members:

-.. autoclass:: transformers.modeling_tf_lxmert.TFLxmertModelOutput
+.. autoclass:: transformers.models.lxmert.modeling_tf_lxmert.TFLxmertModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_lxmert.TFLxmertForPreTrainingOutput
+.. autoclass:: transformers.models.lxmert.modeling_tf_lxmert.TFLxmertForPreTrainingOutput
    :members:


--- a/docs/source/model_doc/marian.rst
+++ b/docs/source/model_doc/marian.rst
@@ -1,42 +1,152 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 MarianMT
 -----------------------------------------------------------------------------------------------------------------------
-**Bugs:** If you see something strange,
-file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title>`__ and assign
-@sshleifer. Translations should be similar, but not identical to, output in the test set linked to in each model card.
+
+**Bugs:** If you see something strange, file a `Github Issue
+<https://github.com/huggingface/transformers/issues/new?assignees=patrickvonplaten&labels=&template=bug-report.md&title>`__
+and assign @patrickvonplaten.
+
+Translations should be similar, but not identical, to the output in the test set linked to in each model card.

 Implementation Notes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Each model is about 298 MB on disk, there are 1,000+ models.
+
+- Each model is about 298 MB on disk, there are more than 1,000 models.
 - The list of supported language pairs can be found `here <https://huggingface.co/Helsinki-NLP>`__.
- models were originally trained by `Jörg Tiedemann <https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann>`__ using the `Marian <https://marian-nmt.github.io/>`_ C++ library, which supports fast training and translation.
- All models are transformer encoder-decoders with 6 layers in each component. Each model's performance is documented in a model card.
+- Models were originally trained by `Jörg Tiedemann
+  <https://researchportal.helsinki.fi/en/persons/j%C3%B6rg-tiedemann>`__ using the `Marian
+  <https://marian-nmt.github.io/>`__ C++ library, which supports fast training and translation.
+- All models are transformer encoder-decoders with 6 layers in each component. Each model's performance is documented
+  in a model card.
 - The 80 opus models that require BPE preprocessing are not supported.
- The modeling code is the same as ``BartForConditionalGeneration`` with a few minor modifications:
-    - static (sinusoid) positional embeddings (``MarianConfig.static_position_embeddings=True``)
-    - a new final_logits_bias (``MarianConfig.add_bias_logits=True``)
-    - no layernorm_embedding (``MarianConfig.normalize_embedding=False``)
-    - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix. (Bart uses <s/>)
- Code to bulk convert models can be found in ``convert_marian_to_pytorch.py``
+- The modeling code is the same as :class:`~transformers.BartForConditionalGeneration` with a few minor modifications:
+
+    - static (sinusoid) positional embeddings (:obj:`MarianConfig.static_position_embeddings=True`)
+    - no layernorm_embedding (:obj:`MarianConfig.normalize_embedding=False`)
+    - the model starts generating with :obj:`pad_token_id` (which has 0 as a token_embedding) as the prefix (Bart uses
+      :obj:`<s/>`).
+- Code to bulk convert models can be found in ``convert_marian_to_pytorch.py``.

 Naming
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- All  model names use the following format: ``Helsinki-NLP/opus-mt-{src}-{tgt}``
- The language codes used to name models are inconsistent. Two digit codes can usually be found `here <https://developers.google.com/admin-sdk/directory/v1/languages>`_, three digit codes require googling "language code {code}".
- Codes formatted like ``es_AR`` are usually ``code_{region}``. That one is spanish documents from Argentina.

+- All model names use the following format: :obj:`Helsinki-NLP/opus-mt-{src}-{tgt}`
+- The language codes used to name models are inconsistent. Two digit codes can usually be found `here
+  <https://developers.google.com/admin-sdk/directory/v1/languages>`__, three digit codes require googling "language
+  code {code}".
+- Codes formatted like :obj:`es_AR` are usually :obj:`code_{region}`. That one is Spanish from Argentina.
+- The models were converted in two stages. The first 1000 models use ISO-639-2 codes to identify languages, the second
+  group use a combination of ISO-639-5 codes and ISO-639-2 codes.
+
+
+Examples
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+- Since Marian models are smaller than many other translation models available in the library, they can be useful for
+  fine-tuning experiments and integration tests.
+- `Fine-tune on GPU
+  <https://github.com/huggingface/transformers/blob/master/examples/research_projects/seq2seq-distillation/train_distil_marian_enro_teacher.sh>`__
+- `Fine-tune on GPU with pytorch-lightning
+  <https://github.com/huggingface/transformers/blob/master/examples/research_projects/seq2seq-distillation/train_distil_marian_no_teacher.sh>`__

 Multilingual Models
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-All  model names use the following format: ``Helsinki-NLP/opus-mt-{src}-{tgt}``:
-    - if ``src`` is in all caps, the model supports multiple input languages, you can figure out which ones by looking at the model card, or the Group Members `mapping <https://gist.github.com/sshleifer/6d20e7761931b08e73c3219027b97b8a>`_ .
-    - if ``tgt`` is in all caps, the model can output multiple languages, and you should specify a language code by prepending the desired output language to the src_text
-    - You can see a tokenizer's supported language codes in ``tokenizer.supported_language_codes``
+- All model names use the following format: :obj:`Helsinki-NLP/opus-mt-{src}-{tgt}`.
+- If a model can output multiple languages, you should specify a language code by prepending the desired output
+  language to the :obj:`src_text`.
+- You can see a model's supported language codes in its model card, under target constituents, like in `opus-mt-en-roa
+  <https://huggingface.co/Helsinki-NLP/opus-mt-en-roa>`__.
+- Note that if a model is only multilingual on the source side, like :obj:`Helsinki-NLP/opus-mt-roa-en`, no language
+  codes are required.

-Example of translating english to many romance languages, using language codes:
+New multi-lingual models from the `Tatoeba-Challenge repo <https://github.com/Helsinki-NLP/Tatoeba-Challenge>`__
+require 3 character language codes:

 .. code-block:: python

+    from transformers import MarianMTModel, MarianTokenizer
+    src_text = [
+        '>>fra<< this is a sentence in english that we want to translate to french',
+        '>>por<< This should go to portuguese',
+        '>>esp<< And this to Spanish'
+    ]
+
+    model_name = 'Helsinki-NLP/opus-mt-en-roa'
+    tokenizer = MarianTokenizer.from_pretrained(model_name)
+    print(tokenizer.supported_language_codes)
+    model = MarianMTModel.from_pretrained(model_name)
+    translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
+    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
+    # ["c'est une phrase en anglais que nous voulons traduire en français",
+    # 'Isto deve ir para o português.',
+    # 'Y esto al español']
+
+
+
+
+Code to see available pretrained models:
+
+.. code-block:: python
+
+    from transformers.hf_api import HfApi
+    model_list = HfApi().model_list()
+    org = "Helsinki-NLP"
+    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
+    suffix = [x.split('/')[1] for x in model_ids]
+    old_style_multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
+
+
+
+Old Style Multi-Lingual Models
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+These are the old style multi-lingual models ported from the OPUS-MT-Train repo, and the members of each language
+group:
+
+.. code-block:: python
+
+    ['Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU',
+     'Helsinki-NLP/opus-mt-ROMANCE-en',
+     'Helsinki-NLP/opus-mt-SCANDINAVIA-SCANDINAVIA',
+     'Helsinki-NLP/opus-mt-de-ZH',
+     'Helsinki-NLP/opus-mt-en-CELTIC',
+     'Helsinki-NLP/opus-mt-en-ROMANCE',
+     'Helsinki-NLP/opus-mt-es-NORWAY',
+     'Helsinki-NLP/opus-mt-fi-NORWAY',
+     'Helsinki-NLP/opus-mt-fi-ZH',
+     'Helsinki-NLP/opus-mt-fi_nb_no_nn_ru_sv_en-SAMI',
+     'Helsinki-NLP/opus-mt-sv-NORWAY',
+     'Helsinki-NLP/opus-mt-sv-ZH']
+    GROUP_MEMBERS = {
+     'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
+     'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
+     'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+     'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
+     'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'],
+     'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'],
+     'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
+    }
+
+
+
+
+Example of translating English to many Romance languages, using old-style 2 character language codes:
+
+
+.. code-block:: python
+
    from transformers import MarianMTModel, MarianTokenizer
    src_text = [
        '>>fr<< this is a sentence in english that we want to translate to french',
@@ -47,55 +157,17 @@ Example of translating english to many romance languages, using language codes:
    model_name = 'Helsinki-NLP/opus-mt-en-ROMANCE'
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    print(tokenizer.supported_language_codes)
+
    model = MarianMTModel.from_pretrained(model_name)
-    translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text))
+    translated = model.generate(**tokenizer.prepare_seq2seq_batch(src_text, return_tensors="pt"))
    tgt_text = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]
-    # ["c'est une phrase en anglais que nous voulons traduire en français",
-    # 'Isto deve ir para o português.',
-    # 'Y esto al español']
-
-Sometimes, models were trained on collections of languages that do not resolve to a group. In this case, _ is used as a separator for src or tgt, as in ``'Helsinki-NLP/opus-mt-en_el_es_fi-en_el_es_fi'``. These still require language codes.
-There are many supported regional language codes, like ``>>es_ES<<`` (Spain) and ``>>es_AR<<`` (Argentina), that do not seem to change translations. I have not found these to provide different results than just using ``>>es<<``.
-
-For Example:
-    - ``Helsinki-NLP/opus-mt-NORTH_EU-NORTH_EU``: translates from all NORTH_EU languages (see `mapping <https://gist.github.com/sshleifer/6d20e7761931b08e73c3219027b97b8a>`_) to all NORTH_EU languages. Use a special language code like ``>>de<<`` to specify output language.
-    - ``Helsinki-NLP/opus-mt-ROMANCE-en``: translates from many romance languages to english, no codes needed since there is only 1 tgt language.
+    # ["c'est une phrase en anglais que nous voulons traduire en français", 'Isto deve ir para o português.',  'Y esto al español']



-.. code-block:: python
-
-    GROUP_MEMBERS = {
-     'ZH': ['cmn', 'cn', 'yue', 'ze_zh', 'zh_cn', 'zh_CN', 'zh_HK', 'zh_tw', 'zh_TW', 'zh_yue', 'zhs', 'zht', 'zh'],
-     'ROMANCE': ['fr', 'fr_BE', 'fr_CA', 'fr_FR', 'wa', 'frp', 'oc', 'ca', 'rm', 'lld', 'fur', 'lij', 'lmo', 'es', 'es_AR', 'es_CL', 'es_CO', 'es_CR', 'es_DO', 'es_EC', 'es_ES', 'es_GT', 'es_HN', 'es_MX', 'es_NI', 'es_PA', 'es_PE', 'es_PR', 'es_SV', 'es_UY', 'es_VE', 'pt', 'pt_br', 'pt_BR', 'pt_PT', 'gl', 'lad', 'an', 'mwl', 'it', 'it_IT', 'co', 'nap', 'scn', 'vec', 'sc', 'ro', 'la'],
-     'NORTH_EU': ['de', 'nl', 'fy', 'af', 'da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
-     'SCANDINAVIA': ['da', 'fo', 'is', 'no', 'nb', 'nn', 'sv'],
-     'SAMI': ['se', 'sma', 'smj', 'smn', 'sms'],
-     'NORWAY': ['nb_NO', 'nb', 'nn_NO', 'nn', 'nog', 'no_nb', 'no'],
-     'CELTIC': ['ga', 'cy', 'br', 'gd', 'kw', 'gv']
-    }
-
-Code to see available pretrained models:
-
-.. code-block:: python
-
-    from transformers.hf_api import HfApi
-    model_list = HfApi().model_list()
-    org = "Helsinki-NLP"
-    model_ids = [x.modelId for x in model_list if x.modelId.startswith(org)]
-    suffix = [x.split('/')[1] for x in model_ids]
-    multi_models = [f'{org}/{s}' for s in suffix if s != s.lower()]
-
-MarianMTModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-Pytorch version of marian-nmt's transformer.h (c++). Designed for the OPUS-NMT translation checkpoints.
-Model API is identical to BartForConditionalGeneration.
-Available models are listed at `Model List <https://huggingface.co/models?search=Helsinki-NLP>`__
-This class inherits nearly all functionality from ``BartForConditionalGeneration``, see that page for method signatures.
-
 MarianConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 .. autoclass:: transformers.MarianConfig
    :members:

@@ -107,5 +179,29 @@ MarianTokenizer
    :members: prepare_seq2seq_batch


+MarianModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MarianModel
+    :members: forward


+MarianMTModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MarianMTModel
+    :members: forward
+
+
+TFMarianModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMarianModel
+    :members: call
+
+
+TFMarianMTModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMarianMTModel
+    :members: call
--- a/docs/source/model_doc/mbart.rst
+++ b/docs/source/model_doc/mbart.rst
@@ -1,27 +1,54 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 MBart
 -----------------------------------------------------------------------------------------------------------------------
-**DISCLAIMER:** If you see something strange,
-file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
-@sshleifer
+
+**DISCLAIMER:** If you see something strange, file a `Github Issue
+<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+@patrickvonplaten

 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-The MBart model was presented in `Multilingual Denoising Pre-training for Neural Machine Translation <https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov
-Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer. According to the abstract,

-MBART is a sequence-to-sequence denoising auto-encoder pre-trained on large-scale monolingual corpora in many languages using the BART objective. mBART is one of the first methods for pre-training a complete sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only on the encoder, decoder, or reconstructing parts of the text.
+The MBart model was presented in `Multilingual Denoising Pre-training for Neural Machine Translation
+<https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan
+Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+
+According to the abstract, MBART is a sequence-to-sequence denoising auto-encoder pretrained on large-scale monolingual
+corpora in many languages using the BART objective. mBART is one of the first methods for pretraining a complete
+sequence-to-sequence model by denoising full texts in multiple languages, while previous approaches have focused only
+on the encoder, decoder, or reconstructing parts of the text.

 The Authors' code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/mbart>`__

+Examples
+_______________________________________________________________________________________________________________________
+
+- Examples and scripts for fine-tuning mBART and other models for sequence to sequence tasks can be found in
+  :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
+- Given the large embeddings table, mBART consumes a large amount of GPU RAM, especially for fine-tuning.
+  :class:`~transformers.MarianMTModel` is usually a better choice for bilingual machine translation.

 Training
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-MBart is a multilingual encoder-decoder (seq-to-seq) model primarily intended for translation task. 
-As the model is multilingual it expects the sequences in a different format. A special language id token 
-is added in both the source and target text. The source text format is ``X [eos, src_lang_code]`` 
-where ``X`` is the source text. The target text format is ```[tgt_lang_code] X [eos]```. ```bos``` is never used.
-The ```MBartTokenizer.prepare_seq2seq_batch``` handles this automatically and should be used to encode 
-the sequences for seq-2-seq fine-tuning.
+
+MBart is a multilingual encoder-decoder (seq-to-seq) model primarily intended for translation tasks. As the model is
+multilingual it expects the sequences in a different format. A special language id token is added in both the source
+and target text. The source text format is :obj:`X [eos, src_lang_code]` where :obj:`X` is the source text. The target
+text format is :obj:`[tgt_lang_code] X [eos]`. :obj:`bos` is never used.
+
+The :meth:`~transformers.MBartTokenizer.prepare_seq2seq_batch` handles this automatically and should be used to encode
+the sequences for sequence-to-sequence fine-tuning.

 - Supervised training

@@ -29,17 +56,13 @@ the sequences for seq-2-seq fine-tuning.

    example_english_phrase = "UN Chief Says There Is No Military Solution in Syria"
    expected_translation_romanian = "Şeful ONU declară că nu există o soluţie militară în Siria"
-    batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian)
-    input_ids = batch["input_ids"]
-    target_ids = batch["decoder_input_ids"]
-    decoder_input_ids = target_ids[:, :-1].contiguous()
-    labels = target_ids[:, 1:].clone()
-    model(input_ids=input_ids, decoder_input_ids=decoder_input_ids, labels=labels) #forward
+    batch = tokenizer.prepare_seq2seq_batch(example_english_phrase, src_lang="en_XX", tgt_lang="ro_RO", tgt_texts=expected_translation_romanian, return_tensors="pt")
+    model(input_ids=batch['input_ids'], labels=batch['labels']) # forward pass

 - Generation

-    While generating the target text set the `decoder_start_token_id` to the target language id. 
-    The following example shows how to translate English to Romanian using the ```facebook/mbart-large-en-ro``` model.
+    While generating the target text set the :obj:`decoder_start_token_id` to the target language id. The following
+    example shows how to translate English to Romanian using the :obj:`facebook/mbart-large-en-ro` model.

 .. code-block::

@@ -47,7 +70,7 @@ the sequences for seq-2-seq fine-tuning.
    model = MBartForConditionalGeneration.from_pretrained("facebook/mbart-large-en-ro")
    tokenizer = MBartTokenizer.from_pretrained("facebook/mbart-large-en-ro")
    article = "UN Chief Says There Is No Military Solution in Syria"
-    batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX")
+    batch = tokenizer.prepare_seq2seq_batch(src_texts=[article], src_lang="en_XX", return_tensors="pt")
    translated_tokens = model.generate(**batch, decoder_start_token_id=tokenizer.lang_code_to_id["ro_RO"])
    translation = tokenizer.batch_decode(translated_tokens, skip_special_tokens=True)[0]
    assert translation == "Şeful ONU declară că nu există o soluţie militară în Siria"
@@ -67,10 +90,49 @@ MBartTokenizer
    :members: build_inputs_with_special_tokens, prepare_seq2seq_batch


+MBartTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartTokenizerFast
+    :members:
+
+
+MBartModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartModel
+    :members:
+
+
 MBartForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.MBartForConditionalGeneration
-    :members: generate, forward
+    :members:


+MBartForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartForQuestionAnswering
+    :members:
+
+
+MBartForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MBartForSequenceClassification
+
+
+TFMBartModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMBartModel
+    :members: call
+
+
+TFMBartForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMBartForConditionalGeneration
+    :members: call
--- a/docs/source/model_doc/mobilebert.rst
+++ b/docs/source/model_doc/mobilebert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 MobileBERT
 -----------------------------------------------------------------------------------------------------------------------

@@ -14,23 +26,23 @@ The abstract from the paper is the following:
 *Natural Language Processing (NLP) has recently achieved great success by using huge pre-trained models with hundreds
 of millions of parameters. However, these models suffer from heavy model sizes and high latency such that they cannot
 be deployed to resource-limited mobile devices. In this paper, we propose MobileBERT for compressing and accelerating
-the popular BERT model. Like the original BERT, MobileBERT is task-agnostic, that is, it can be generically applied
-to various downstream NLP tasks via simple fine-tuning. Basically, MobileBERT is a thin version of BERT_LARGE, while
-equipped with bottleneck structures and a carefully designed balance between self-attentions and feed-forward
-networks. To train MobileBERT, we first train a specially designed teacher model, an inverted-bottleneck incorporated
-BERT_LARGE model. Then, we conduct knowledge transfer from this teacher to MobileBERT. Empirical studies show that
-MobileBERT is 4.3x smaller and 5.5x faster than BERT_BASE while achieving competitive results on well-known
-benchmarks. On the natural language inference tasks of GLUE, MobileBERT achieves a GLUEscore o 77.7
-(0.6 lower than BERT_BASE), and 62 ms latency on a Pixel 4 phone. On the SQuAD v1.1/v2.0 question answering task,
-MobileBERT achieves a dev F1 score of 90.0/79.2 (1.5/2.1 higher than BERT_BASE).*
+the popular BERT model. Like the original BERT, MobileBERT is task-agnostic, that is, it can be generically applied to
+various downstream NLP tasks via simple fine-tuning. Basically, MobileBERT is a thin version of BERT_LARGE, while
+equipped with bottleneck structures and a carefully designed balance between self-attentions and feed-forward networks.
+To train MobileBERT, we first train a specially designed teacher model, an inverted-bottleneck incorporated BERT_LARGE
+model. Then, we conduct knowledge transfer from this teacher to MobileBERT. Empirical studies show that MobileBERT is
+4.3x smaller and 5.5x faster than BERT_BASE while achieving competitive results on well-known benchmarks. On the
+natural language inference tasks of GLUE, MobileBERT achieves a GLUEscore o 77.7 (0.6 lower than BERT_BASE), and 62 ms
+latency on a Pixel 4 phone. On the SQuAD v1.1/v2.0 question answering task, MobileBERT achieves a dev F1 score of
+90.0/79.2 (1.5/2.1 higher than BERT_BASE).*

 Tips:

- MobileBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on
-  the right rather than the left.
- MobileBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective.
-  It is therefore efficient at predicting masked tokens and at NLU in general, but is not optimal for
-  text generation. Models trained with a causal language modeling (CLM) objective are better in that regard.
+- MobileBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right rather
+  than the left.
+- MobileBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore
+  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained
+  with a causal language modeling (CLM) objective are better in that regard.

 The original code can be found `here <https://github.com/google-research/mobilebert>`__.

@@ -58,10 +70,10 @@ MobileBertTokenizerFast
 MobileBert specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_mobilebert.MobileBertForPreTrainingOutput
+.. autoclass:: transformers.models.mobilebert.modeling_mobilebert.MobileBertForPreTrainingOutput
    :members:

-.. autoclass:: transformers.modeling_tf_mobilebert.TFMobileBertForPreTrainingOutput
+.. autoclass:: transformers.models.mobilebert.modeling_tf_mobilebert.TFMobileBertForPreTrainingOutput
    :members:


--- a/docs/source/model_doc/mpnet.rst
+++ b/docs/source/model_doc/mpnet.rst
@@ -0,0 +1,149 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+MPNet
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The MPNet model was proposed in `MPNet: Masked and Permuted Pre-training for Language Understanding
+<https://arxiv.org/abs/2004.09297>`__ by Kaitao Song, Xu Tan, Tao Qin, Jianfeng Lu, Tie-Yan Liu.
+
+MPNet adopts a novel pre-training method, named masked and permuted language modeling, to inherit the advantages of
+masked language modeling and permuted language modeling for natural language understanding.
+
+The abstract from the paper is the following:
+
+*BERT adopts masked language modeling (MLM) for pre-training and is one of the most successful pre-training models.
+Since BERT neglects dependency among predicted tokens, XLNet introduces permuted language modeling (PLM) for
+pre-training to address this problem. However, XLNet does not leverage the full position information of a sentence and
+thus suffers from position discrepancy between pre-training and fine-tuning. In this paper, we propose MPNet, a novel
+pre-training method that inherits the advantages of BERT and XLNet and avoids their limitations. MPNet leverages the
+dependency among predicted tokens through permuted language modeling (vs. MLM in BERT), and takes auxiliary position
+information as input to make the model see a full sentence and thus reducing the position discrepancy (vs. PLM in
+XLNet). We pre-train MPNet on a large-scale dataset (over 160GB text corpora) and fine-tune on a variety of
+down-streaming tasks (GLUE, SQuAD, etc). Experimental results show that MPNet outperforms MLM and PLM by a large
+margin, and achieves better results on these tasks compared with previous state-of-the-art pre-trained methods (e.g.,
+BERT, XLNet, RoBERTa) under the same model setting.*
+
+Tips:
+
+- MPNet doesn't have :obj:`token_type_ids`, so you don't need to indicate which token belongs to which segment. Just
+  separate your segments with the separation token :obj:`tokenizer.sep_token` (or :obj:`[sep]`).
+
+The original code can be found `here <https://github.com/microsoft/MPNet>`__.
+
+MPNetConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetConfig
+    :members:
+
+
+MPNetTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+MPNetTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetTokenizerFast
+    :members:
+
+
+MPNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetModel
+    :members: forward
+
+
+MPNetForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForMaskedLM
+    :members: forward
+
+
+MPNetForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForSequenceClassification
+    :members: forward
+
+
+MPNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForMultipleChoice
+    :members: forward
+
+
+MPNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForTokenClassification
+    :members: forward
+
+
+MPNetForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MPNetForQuestionAnswering
+    :members: forward
+
+
+TFMPNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetModel
+    :members: call
+
+
+TFMPNetForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForMaskedLM
+    :members: call
+
+
+TFMPNetForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForSequenceClassification
+    :members: call
+
+
+TFMPNetForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForMultipleChoice
+    :members: call
+
+
+TFMPNetForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForTokenClassification
+    :members: call
+
+
+TFMPNetForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMPNetForQuestionAnswering
+    :members: call
--- a/docs/source/model_doc/mt5.rst
+++ b/docs/source/model_doc/mt5.rst
@@ -0,0 +1,95 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+MT5
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The mT5 model was presented in `mT5: A massively multilingual pre-trained text-to-text transformer
+<https://arxiv.org/abs/2010.11934>`__ by Linting Xue, Noah Constant, Adam Roberts, Mihir Kale, Rami Al-Rfou, Aditya
+Siddhant, Aditya Barua, Colin Raffel.
+
+The abstract from the paper is the following:
+
+*The recent "Text-to-Text Transfer Transformer" (T5) leveraged a unified text-to-text format and scale to attain
+state-of-the-art results on a wide variety of English-language NLP tasks. In this paper, we introduce mT5, a
+multilingual variant of T5 that was pre-trained on a new Common Crawl-based dataset covering 101 languages. We describe
+the design and modified training of mT5 and demonstrate its state-of-the-art performance on many multilingual
+benchmarks. All of the code and model checkpoints used in this work are publicly available.*
+
+The original code can be found `here <https://github.com/google-research/multilingual-t5>`__.
+
+MT5Config
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5Config
+    :members:
+
+
+MT5Tokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5Tokenizer
+
+See :class:`~transformers.T5Tokenizer` for all details.
+
+
+MT5TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5TokenizerFast
+
+See :class:`~transformers.T5TokenizerFast` for all details.
+
+
+MT5Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5Model
+    :members:
+
+
+MT5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5ForConditionalGeneration
+    :members:
+
+
+MT5EncoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.MT5EncoderModel
+    :members:
+
+
+TFMT5Model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMT5Model
+    :members:
+
+
+TFMT5ForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMT5ForConditionalGeneration
+    :members:
+
+
+TFMT5EncoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFMT5EncoderModel
+    :members:
--- a/docs/source/model_doc/pegasus.rst
+++ b/docs/source/model_doc/pegasus.rst
@@ -1,45 +1,76 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Pegasus
 -----------------------------------------------------------------------------------------------------------------------
-**DISCLAIMER:** If you see something strange,
-file a `Github Issue <https://github.com/huggingface/transformers/issues/new?assignees=sshleifer&labels=&template=bug-report.md&title>`__ and assign
-@sshleifer.
+
+**DISCLAIMER:** If you see something strange, file a `Github Issue
+<https://github.com/huggingface/transformers/issues/new?assignees=patrickvonplaten&labels=&template=bug-report.md&title>`__
+and assign @patrickvonplaten.


 Overview
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-The Pegasus model was proposed in `PEGASUS: Pre-training with Extracted Gap-sentences for
-Abstractive Summarization <https://arxiv.org/pdf/1912.08777.pdf>`_ by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.
+The Pegasus model was proposed in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization
+<https://arxiv.org/pdf/1912.08777.pdf>`__ by Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.
+
 According to the abstract,

- Pegasus' pretraining task is intentionally similar to summarization: important sentences are removed/masked from an input document and are generated together as one output sequence from the remaining sentences, similar to an extractive summary.
+- Pegasus' pretraining task is intentionally similar to summarization: important sentences are removed/masked from an
+  input document and are generated together as one output sequence from the remaining sentences, similar to an
+  extractive summary.
 - Pegasus achieves SOTA summarization performance on all 12 downstream tasks, as measured by ROUGE and human eval.

-The Authors' code can be found `here <https://github.com/google-research/pegasus>`_.
+The Authors' code can be found `here <https://github.com/google-research/pegasus>`__.


 Checkpoints
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-All the `checkpoints <https://huggingface.co/models?search=pegasus>`_ are finetuned for summarization, besides ``pegasus-large``, whence the other checkpoints are finetuned.
+
+All the `checkpoints <https://huggingface.co/models?search=pegasus>`__ are fine-tuned for summarization, besides
+`pegasus-large`, whence the other checkpoints are fine-tuned:
+
 - Each checkpoint is 2.2 GB on disk and 568M parameters.
 - FP16 is not supported (help/ideas on this appreciated!).
 - Summarizing xsum in fp32 takes about 400ms/sample, with default parameters on a v100 GPU.
- For XSUM, The paper reports rouge1,rouge2, rougeL of paper: 47.21/24.56/39.25. As of Aug 9, this port scores 46.91/24.34/39.1.
-The gap is likely because of different alpha/length_penalty implementations in beam search.
+- Full replication results and correctly pre-processed data can be found in this `Issue
+  <https://github.com/huggingface/transformers/issues/6844#issue-689259666>`__.
+- `Distilled checkpoints <https://huggingface.co/models?search=distill-pegasus>`__ are described in this `paper
+  <https://arxiv.org/abs/2010.13002>`__.
+
+Examples
+_______________________________________________________________________________________________________________________
+
+- :prefix_link:`Script <examples/seq2seq/finetune_pegasus_xsum.sh>` to fine-tune pegasus on the XSUM dataset. Data
+  download instructions at :prefix_link:`examples/seq2seq/ <examples/seq2seq/README.md>`.
+- FP16 is not supported (help/ideas on this appreciated!).
+- The adafactor optimizer is recommended for pegasus fine-tuning.


 Implementation Notes
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 - All models are transformer encoder-decoders with 16 layers in each component.
- The implementation is completely inherited from ``BartForConditionalGeneration``
+- The implementation is completely inherited from :class:`~transformers.BartForConditionalGeneration`.
 - Some key configuration differences:
+
    - static, sinusoidal position embeddings
-    - no ``layernorm_embedding`` (``PegasusConfig.normalize_embedding=False``)
    - the model starts generating with pad_token_id (which has 0 token_embedding) as the prefix.
-    - ``num_beams=8``
- All pretrained pegasus checkpoints are the same besides three attributes: ``tokenizer.model_max_length`` (max input size),  ``max_length`` (max num tokens to generate) and ``length_penalty``
- Code to convert checkpoints trained in the author's `repo <https://github.com/google-research/pegasus>`_ can be found in ``convert_pegasus_tf_to_pytorch.py``
+    - more beams are used (:obj:`num_beams=8`)
+- All pretrained pegasus checkpoints are the same besides three attributes: :obj:`tokenizer.model_max_length` (maximum
+  input size), :obj:`max_length` (the maximum number of tokens to generate) and :obj:`length_penalty`.
+- The code to convert checkpoints trained in the authors' `repo <https://github.com/google-research/pegasus>`__ can be
+  found in ``convert_pegasus_tf_to_pytorch.py``.


 Usage Example
@@ -57,61 +88,58 @@ Usage Example
    torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
    tokenizer = PegasusTokenizer.from_pretrained(model_name)
    model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
-    batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest').to(torch_device)
+    batch = tokenizer.prepare_seq2seq_batch(src_text, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
    translated = model.generate(**batch)
    tgt_text = tokenizer.batch_decode(translated, skip_special_tokens=True)
    assert tgt_text[0] == "California's largest electricity provider has turned off power to hundreds of thousands of customers."

-PegasusForConditionalGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-This class inherits all functionality from ``BartForConditionalGeneration``, see that page for method signatures.
-Available models are listed at `Model List <https://huggingface.co/models?search=pegasus>`__
-
-.. autoclass:: transformers.PegasusForConditionalGeneration
-    :members:


 PegasusConfig
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-This config fully inherits from ``BartConfig``, but pegasus uses different default values:
-Up to date parameter values can be seen in `S3 <https://s3.amazonaws.com/models.huggingface.co/bert/google/pegasus-xsum/config.json>`_.
-As of Aug 10, 2020, they are:

-.. code-block:: python
-
-    dict(
-    vocab_size=96103,
-    max_position_embeddings=512,
-    d_model=1024,
-    encoder_ffn_dim=4096,
-    decoder_ffn_dim=4096,
-    encoder_attention_heads=16,
-    decoder_attention_heads=16,
-    encoder_layers=16,
-    decoder_layers=16,
-    dropout=0.1,
-    attention_dropout=0.1,
-    activation_dropout=0.1,
-    pad_token_id=0,
-    eos_token_id=1,
-    is_encoder_decoder=True,
-    normalize_before=True,
-    scale_embedding=True,
-    normalize_embedding=False,
-    add_final_layer_norm=True,
-    static_position_embeddings=True,
-    num_beams=8,
-    activation_function="relu",
-    )
+.. autoclass:: transformers.PegasusConfig


 PegasusTokenizer
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 warning: ``add_tokens`` does not work at the moment.

 .. autoclass:: transformers.PegasusTokenizer
    :members: __call__, prepare_seq2seq_batch


+PegasusTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

+.. autoclass:: transformers.PegasusTokenizerFast
+    :members:
+
+
+PegasusModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.PegasusModel
+    :members: forward
+
+
+PegasusForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.PegasusForConditionalGeneration
+    :members: forward
+
+
+TFPegasusModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFPegasusModel
+    :members: call
+
+
+TFPegasusForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFPegasusForConditionalGeneration
+    :members: call
--- a/docs/source/model_doc/phobert.rst
+++ b/docs/source/model_doc/phobert.rst
@@ -0,0 +1,59 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+PhoBERT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The PhoBERT model was proposed in `PhoBERT: Pre-trained language models for Vietnamese
+<https://www.aclweb.org/anthology/2020.findings-emnlp.92.pdf>`__ by Dat Quoc Nguyen, Anh Tuan Nguyen.
+
+The abstract from the paper is the following:
+
+*We present PhoBERT with two versions, PhoBERT-base and PhoBERT-large, the first public large-scale monolingual
+language models pre-trained for Vietnamese. Experimental results show that PhoBERT consistently outperforms the recent
+best pre-trained multilingual model XLM-R (Conneau et al., 2020) and improves the state-of-the-art in multiple
+Vietnamese-specific NLP tasks including Part-of-speech tagging, Dependency parsing, Named-entity recognition and
+Natural language inference.*
+
+Example of use:
+
+.. code-block::
+
+  import torch
+  from transformers import AutoModel, AutoTokenizer
+
+  phobert = AutoModel.from_pretrained("vinai/phobert-base")
+  tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
+
+  # INPUT TEXT MUST BE ALREADY WORD-SEGMENTED!
+  line = "Tôi là sinh_viên trường đại_học Công_nghệ ."
+
+  input_ids = torch.tensor([tokenizer.encode(line)])
+
+  with torch.no_grad():
+      features = phobert(input_ids)  # Model outputs are now tuples
+
+  ## With TensorFlow 2.0+:
+  # from transformers import TFAutoModel
+  # phobert = TFAutoModel.from_pretrained("vinai/phobert-base")
+
+
+The original code can be found `here <https://github.com/VinAIResearch/PhoBERT>`__.
+
+PhobertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.PhobertTokenizer
+    :members: 
--- a/docs/source/model_doc/prophetnet.rst
+++ b/docs/source/model_doc/prophetnet.rst
@@ -0,0 +1,106 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+ProphetNet
+-----------------------------------------------------------------------------------------------------------------------
+
+**DISCLAIMER:** If you see something strange, file a `Github Issue
+<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+@patrickvonplaten
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The ProphetNet model was proposed in `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training
+<https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei
+Zhang, Ming Zhou on 13 Jan, 2020.
+
+ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of just
+the next token.
+
+The abstract from the paper is the following:
+
+*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
+self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
+the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
+n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
+step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent
+overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
+dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
+abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
+state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
+
+The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.
+
+
+ProphetNetConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetConfig
+    :members:
+
+
+ProphetNetTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetTokenizer
+    :members:
+
+
+ProphetNet specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqLMOutput
+    :members:
+
+.. autoclass:: transformers.models.prophetnet.modeling_prophetnet.ProphetNetSeq2SeqModelOutput
+    :members:
+
+.. autoclass:: transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderModelOutput
+    :members:
+
+.. autoclass:: transformers.models.prophetnet.modeling_prophetnet.ProphetNetDecoderLMOutput
+    :members:
+
+ProphetNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetModel
+    :members: forward
+
+
+ProphetNetEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetEncoder
+    :members: forward
+
+
+ProphetNetDecoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetDecoder
+    :members: forward
+
+
+ProphetNetForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetForConditionalGeneration
+    :members: forward
+
+
+ProphetNetForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ProphetNetForCausalLM
+    :members: forward
--- a/docs/source/model_doc/rag.rst
+++ b/docs/source/model_doc/rag.rst
@@ -1,8 +1,20 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 RAG
----------------------------------------------------
+-----------------------------------------------------------------------------------------------------------------------

 Overview
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and
 sequence-to-sequence models. RAG models retrieve documents, pass them to a seq2seq model, then marginalize to generate
@@ -15,77 +27,70 @@ Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäs

 The abstract from the paper is the following:

-*Large pre-trained language models have been shown to store factual knowledge
-in their parameters, and achieve state-of-the-art results when fine-tuned on
-downstream NLP tasks. However, their ability to access and precisely manipulate
-knowledge is still limited, and hence on knowledge-intensive tasks, their
-performance lags behind task-specific architectures. Additionally, providing
-provenance for their decisions and updating their world knowledge remain open
-research problems. Pre-trained models with a differentiable access mechanism to
-explicit nonparametric memory can overcome this issue, but have so far been only
-investigated for extractive downstream tasks. We explore a general-purpose
-fine-tuning recipe for retrieval-augmented generation (RAG) — models which combine
-pre-trained parametric and non-parametric memory for language generation. We
-introduce RAG models where the parametric memory is a pre-trained seq2seq model and
-the non-parametric memory is a dense vector index of Wikipedia, accessed with
-a pre-trained neural retriever. We compare two RAG formulations, one which
-conditions on the same retrieved passages across the whole generated sequence, the
-other can use different passages per token. We fine-tune and evaluate our models
-on a wide range of knowledge-intensive NLP tasks and set the state-of-the-art
-on three open domain QA tasks, outperforming parametric seq2seq models and
-task-specific retrieve-and-extract architectures. For language generation tasks, we
-find that RAG models generate more specific, diverse and factual language than a
-state-of-the-art parametric-only seq2seq baseline.*
+*Large pre-trained language models have been shown to store factual knowledge in their parameters, and achieve
+state-of-the-art results when fine-tuned on downstream NLP tasks. However, their ability to access and precisely
+manipulate knowledge is still limited, and hence on knowledge-intensive tasks, their performance lags behind
+task-specific architectures. Additionally, providing provenance for their decisions and updating their world knowledge
+remain open research problems. Pre-trained models with a differentiable access mechanism to explicit nonparametric
+memory can overcome this issue, but have so far been only investigated for extractive downstream tasks. We explore a
+general-purpose fine-tuning recipe for retrieval-augmented generation (RAG) — models which combine pre-trained
+parametric and non-parametric memory for language generation. We introduce RAG models where the parametric memory is a
+pre-trained seq2seq model and the non-parametric memory is a dense vector index of Wikipedia, accessed with a
+pre-trained neural retriever. We compare two RAG formulations, one which conditions on the same retrieved passages
+across the whole generated sequence, the other can use different passages per token. We fine-tune and evaluate our
+models on a wide range of knowledge-intensive NLP tasks and set the state-of-the-art on three open domain QA tasks,
+outperforming parametric seq2seq models and task-specific retrieve-and-extract architectures. For language generation
+tasks, we find that RAG models generate more specific, diverse and factual language than a state-of-the-art
+parametric-only seq2seq baseline.*



 RagConfig
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RagConfig
    :members:


 RagTokenizer
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RagTokenizer
    :members: prepare_seq2seq_batch


 Rag specific outputs
-~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_rag.RetrievAugLMMarginOutput
+.. autoclass:: transformers.models.rag.modeling_rag.RetrievAugLMMarginOutput
    :members:

-.. autoclass:: transformers.modeling_rag.RetrievAugLMOutput
+.. autoclass:: transformers.models.rag.modeling_rag.RetrievAugLMOutput
    :members:

-
-RAGRetriever
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+RagRetriever
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RagRetriever
    :members:


 RagModel
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RagModel
    :members: forward


 RagSequenceForGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RagSequenceForGeneration
    :members: forward, generate


 RagTokenForGeneration
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.RagTokenForGeneration
    :members: forward, generate
--- a/docs/source/model_doc/reformer.rst
+++ b/docs/source/model_doc/reformer.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Reformer
 -----------------------------------------------------------------------------------------------------------------------

@@ -10,7 +22,7 @@ Overview
 The Reformer model was proposed in the paper `Reformer: The Efficient Transformer
 <https://arxiv.org/abs/2001.04451.pdf>`__ by Nikita Kitaev, Łukasz Kaiser, Anselm Levskaya.

-The abstract from the paper is the following: 
+The abstract from the paper is the following:

 *Large Transformer models routinely achieve state-of-the-art results on a number of tasks but training these models can
 be prohibitively costly, especially on long sequences. We introduce two techniques to improve the efficiency of
@@ -36,12 +48,12 @@ would result in a position encoding matrix:
 .. math::
    X_{i,j}, \text{ with } i \in \left[1,\ldots, d\right] \text{ and } j \in \left[1,\ldots, n_s\right] 

-which alone has over 500M parameters to store. Axial positional encodings factorize :math:`X_{i,j}` into two matrices: 
+which alone has over 500M parameters to store. Axial positional encodings factorize :math:`X_{i,j}` into two matrices:

 .. math::
    X^{1}_{i,j}, \text{ with } i \in \left[1,\ldots, d^1\right] \text{ and } j \in \left[1,\ldots, n_s^1\right] 

-and 
+and

 .. math::
    X^{2}_{i,j}, \text{ with } i \in \left[1,\ldots, d^2\right] \text{ and } j \in \left[1,\ldots, n_s^2\right] 
@@ -67,22 +79,23 @@ factorized embedding vectors: :math:`x^1_{k, l} + x^2_{l, k}`, where as the :obj
 Using the above example again, axial position encoding with :math:`d^1 = 2^5, d^2 = 2^5, n_s^1 = 2^9, n_s^2 = 2^{10}`
 can drastically reduce the number of parameters to :math:`2^{14} + 2^{15} \approx 49000` parameters.

-In practice, the parameter :obj:`config.axial_pos_embds_dim` is set to a tuple :math:`(d^1, d^2)` which sum has to
-be equal to :obj:`config.hidden_size` and :obj:`config.axial_pos_shape` is set to a tuple :math:`(n_s^1, n_s^2)` which
-product has to be equal to :obj:`config.max_embedding_size`, which during training has to be equal to the
-`sequence length` of the :obj:`input_ids`.
+In practice, the parameter :obj:`config.axial_pos_embds_dim` is set to a tuple :math:`(d^1, d^2)` whose sum has to be
+equal to :obj:`config.hidden_size` and :obj:`config.axial_pos_shape` is set to a tuple :math:`(n_s^1, n_s^2)` whose
+product has to be equal to :obj:`config.max_embedding_size`, which during training has to be equal to the `sequence
+length` of the :obj:`input_ids`.


 LSH Self Attention
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
 In Locality sensitive hashing (LSH) self attention the key and query projection weights are tied. Therefore, the key
 query embedding vectors are also tied. LSH self attention uses the locality sensitive hashing mechanism proposed in
 `Practical and Optimal LSH for Angular Distance <https://arxiv.org/abs/1509.02897>`__ to assign each of the tied key
 query embedding vectors to one of :obj:`config.num_buckets` possible buckets. The premise is that the more "similar"
 key query embedding vectors (in terms of *cosine similarity*) are to each other, the more likely they are assigned to
-the same bucket. 
+the same bucket.

-The accuracy of the LSH mechanism can be improved by increasing :obj:`config.num_hashes` or directly the argument 
+The accuracy of the LSH mechanism can be improved by increasing :obj:`config.num_hashes` or directly the argument
 :obj:`num_hashes` of the forward function so that the output of the LSH self attention better approximates the output
 of the "normal" full self attention. The buckets are then sorted and chunked into query key embedding vector chunks
 each of length :obj:`config.lsh_chunk_length`. For each chunk, the query embedding vectors attend to its key vectors
@@ -92,11 +105,11 @@ neighboring chunks and :obj:`config.lsh_num_chunks_after` following neighboring
 For more information, see the `original Paper <https://arxiv.org/abs/2001.04451>`__ or this great `blog post
 <https://www.pragmatic.ml/reformer-deep-dive/>`__.

-Note that :obj:`config.num_buckets` can also be factorized into a list
-:math:`(n_{\text{buckets}}^1, n_{\text{buckets}}^2)`. This way instead of assigning the query key embedding vectors to
-one of :math:`(1,\ldots, n_{\text{buckets}})` they are assigned to one of
-:math:`(1-1,\ldots, n_{\text{buckets}}^1-1, \ldots, 1-n_{\text{buckets}}^2, \ldots, n_{\text{buckets}}^1-n_{\text{buckets}}^2)`.
-This is crucial for very long sequences to save memory.
+Note that :obj:`config.num_buckets` can also be factorized into a list :math:`(n_{\text{buckets}}^1,
+n_{\text{buckets}}^2)`. This way instead of assigning the query key embedding vectors to one of :math:`(1,\ldots,
+n_{\text{buckets}})` they are assigned to one of :math:`(1-1,\ldots, n_{\text{buckets}}^1-1, \ldots,
+1-n_{\text{buckets}}^2, \ldots, n_{\text{buckets}}^1-n_{\text{buckets}}^2)`. This is crucial for very long sequences to
+save memory.

 When training a model from scratch, it is recommended to leave :obj:`config.num_buckets=None`, so that depending on the
 sequence length a good value for :obj:`num_buckets` is calculated on the fly. This value will then automatically be
@@ -128,7 +141,7 @@ multiple of :obj:`config.lsh_chunk_length` and :obj:`config.local_chunk_length`
 Positional Encodings are correctly set as described above. Reformer is very memory efficient so that the model can
 easily be trained on sequences as long as 64000 tokens.

-For training, the :class:`~transformers.ReformerModelWithLMHead` should be used as follows: 
+For training, the :class:`~transformers.ReformerModelWithLMHead` should be used as follows:

 .. code-block::

@@ -150,6 +163,13 @@ ReformerTokenizer
    :members: save_vocabulary


+ReformerTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.ReformerTokenizerFast
+    :members:
+
+
 ReformerModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/retribert.rst
+++ b/docs/source/model_doc/retribert.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 RetriBERT
 -----------------------------------------------------------------------------------------------------------------------

--- a/docs/source/model_doc/roberta.rst
+++ b/docs/source/model_doc/roberta.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 RoBERTa
 -----------------------------------------------------------------------------------------------------------------------

@@ -8,8 +20,8 @@ The RoBERTa model was proposed in `RoBERTa: A Robustly Optimized BERT Pretrainin
 <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer
 Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov. It is based on Google's BERT model released in 2018.

-It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining
-objective and training with much larger mini-batches and learning rates.
+It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining objective and training with
+much larger mini-batches and learning rates.

 The abstract from the paper is the following:

@@ -17,15 +29,15 @@ The abstract from the paper is the following:
 approaches is challenging. Training is computationally expensive, often done on private datasets of different sizes,
 and, as we will show, hyperparameter choices have significant impact on the final results. We present a replication
 study of BERT pretraining (Devlin et al., 2019) that carefully measures the impact of many key hyperparameters and
-training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of
-every model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These
-results highlight the importance of previously overlooked design choices, and raise questions about the source
-of recently reported improvements. We release our models and code.*
+training data size. We find that BERT was significantly undertrained, and can match or exceed the performance of every
+model published after it. Our best model achieves state-of-the-art results on GLUE, RACE and SQuAD. These results
+highlight the importance of previously overlooked design choices, and raise questions about the source of recently
+reported improvements. We release our models and code.*

 Tips:

- This implementation is the same as :class:`~transformers.BertModel` with a tiny embeddings tweak as well as a
-  setup for Roberta pretrained models.
+- This implementation is the same as :class:`~transformers.BertModel` with a tiny embeddings tweak as well as a setup
+  for Roberta pretrained models.
 - RoBERTa has the same architecture as BERT, but uses a byte-level BPE as a tokenizer (same as GPT-2) and uses a
  different pretraining scheme.
 - RoBERTa doesn't have :obj:`token_type_ids`, you don't need to indicate which token belongs to which segment. Just
@@ -146,3 +158,10 @@ TFRobertaForQuestionAnswering

 .. autoclass:: transformers.TFRobertaForQuestionAnswering
    :members: call
+
+
+FlaxRobertaModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.FlaxRobertaModel
+    :members: __call__
--- a/docs/source/model_doc/squeezebert.rst
+++ b/docs/source/model_doc/squeezebert.rst
@@ -0,0 +1,111 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+SqueezeBERT
+-----------------------------------------------------------------------------------------------------------------------
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The SqueezeBERT model was proposed in `SqueezeBERT: What can computer vision teach NLP about efficient neural networks?
+<https://arxiv.org/abs/2006.11316>`__ by Forrest N. Iandola, Albert E. Shaw, Ravi Krishna, Kurt W. Keutzer. It's a
+bidirectional transformer similar to the BERT model. The key difference between the BERT architecture and the
+SqueezeBERT architecture is that SqueezeBERT uses `grouped convolutions <https://blog.yani.io/filter-group-tutorial>`__
+instead of fully-connected layers for the Q, K, V and FFN layers.
+
+The abstract from the paper is the following:
+
+*Humans read and write hundreds of billions of messages every day. Further, due to the availability of large datasets,
+large computing systems, and better neural network models, natural language processing (NLP) technology has made
+significant strides in understanding, proofreading, and organizing these messages. Thus, there is a significant
+opportunity to deploy NLP in myriad applications to help web users, social networks, and businesses. In particular, we
+consider smartphones and other mobile devices as crucial platforms for deploying NLP models at scale. However, today's
+highly-accurate NLP neural network models such as BERT and RoBERTa are extremely computationally expensive, with
+BERT-base taking 1.7 seconds to classify a text snippet on a Pixel 3 smartphone. In this work, we observe that methods
+such as grouped convolutions have yielded significant speedups for computer vision networks, but many of these
+techniques have not been adopted by NLP neural network designers. We demonstrate how to replace several operations in
+self-attention layers with grouped convolutions, and we use this technique in a novel network architecture called
+SqueezeBERT, which runs 4.3x faster than BERT-base on the Pixel 3 while achieving competitive accuracy on the GLUE test
+set. The SqueezeBERT code will be released.*
+
+Tips:
+
+- SqueezeBERT is a model with absolute position embeddings so it's usually advised to pad the inputs on the right
+  rather than the left.
+- SqueezeBERT is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore
+  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained
+  with a causal language modeling (CLM) objective are better in that regard.
+- For best results when finetuning on sequence classification tasks, it is recommended to start with the
+  `squeezebert/squeezebert-mnli-headless` checkpoint.
+
+SqueezeBertConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SqueezeBertConfig
+    :members:
+
+
+SqueezeBertTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SqueezeBertTokenizer
+    :members: build_inputs_with_special_tokens, get_special_tokens_mask,
+        create_token_type_ids_from_sequences, save_vocabulary
+
+
+SqueezeBertTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SqueezeBertTokenizerFast
+    :members:
+
+
+SqueezeBertModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SqueezeBertModel
+    :members:
+
+
+SqueezeBertForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SqueezeBertForMaskedLM
+    :members:
+
+
+SqueezeBertForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SqueezeBertForSequenceClassification
+    :members:
+
+
+SqueezeBertForMultipleChoice
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SqueezeBertForMultipleChoice
+    :members:
+
+
+SqueezeBertForTokenClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SqueezeBertForTokenClassification
+    :members:
+
+
+SqueezeBertForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.SqueezeBertForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/t5.rst
+++ b/docs/source/model_doc/t5.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 T5
 -----------------------------------------------------------------------------------------------------------------------

@@ -17,7 +29,7 @@ The abstract from the paper is the following:
 task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning
 has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of
 transfer learning techniques for NLP by introducing a unified framework that converts every language problem into a
-text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled datasets, transfer
+text-to-text format. Our systematic study compares pretraining objectives, architectures, unlabeled datasets, transfer
 approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration
 with scale and our new "Colossal Clean Crawled Corpus", we achieve state-of-the-art results on many benchmarks covering
 summarization, question answering, text classification, and more. To facilitate future work on transfer learning for
@@ -29,13 +41,12 @@ Tips:
  each task is converted into a text-to-text format. T5 works well on a variety of tasks out-of-the-box by prepending a
  different prefix to the input corresponding to each task, e.g., for translation: *translate English to German: ...*,
  for summarization: *summarize: ...*.
-  
+
  For more information about which prefix to use, it is easiest to look into Appendix D of the `paper
-  <https://arxiv.org/pdf/1910.10683.pdf>`__.
- For sequence-to-sequence generation, it is recommended to use :obj:`T5ForConditionalGeneration.generate()``. This
-  method takes care of feeding the encoded input via cross-attention layers to the decoder and auto-regressively
-  generates the decoder output.
- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.
+  <https://arxiv.org/pdf/1910.10683.pdf>`__.
+- For sequence-to-sequence generation, it is recommended to use :obj:`T5ForConditionalGeneration.generate()`. This
+  method takes care of feeding the encoded input via cross-attention layers to the decoder and auto-regressively
+  generates the decoder output.
+- T5 uses relative scalar embeddings. Encoder input padding can be done on the left and on the right.

 The original code can be found `here <https://github.com/google-research/text-to-text-transfer-transformer>`__.

@@ -44,41 +55,41 @@ Training

 T5 is an encoder-decoder model and converts all NLP problems into a text-to-text format. It is trained using teacher
 forcing. This means that for training we always need an input sequence and a target sequence. The input sequence is fed
-to the model using :obj:`input_ids``. The target sequence is shifted to the right, i.e., prepended by a start-sequence
+to the model using :obj:`input_ids`. The target sequence is shifted to the right, i.e., prepended by a start-sequence
 token and fed to the decoder using the :obj:`decoder_input_ids`. In teacher-forcing style, the target sequence is then
 appended by the EOS token and corresponds to the :obj:`labels`. The PAD token is hereby used as the start-sequence
 token. T5 can be trained / fine-tuned both in a supervised and unsupervised fashion.

 - Unsupervised denoising training

-  In this setup spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) 
-  and the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. 
-  Each sentinel token represents a unique mask token for this sentence and should start with :obj:`<extra_id_0>`, 
+  In this setup spans of the input sequence are masked by so-called sentinel tokens (*a.k.a* unique mask tokens) and
+  the output sequence is formed as a concatenation of the same sentinel tokens and the *real* masked tokens. Each
+  sentinel token represents a unique mask token for this sentence and should start with :obj:`<extra_id_0>`,
  :obj:`<extra_id_1>`, ... up to :obj:`<extra_id_99>`. As a default, 100 sentinel tokens are available in
  :class:`~transformers.T5Tokenizer`.
-  
+
  For instance, the sentence "The cute dog walks in the park" with the masks put on "cute dog" and "the" should be
-  processed as follows: 
+  processed as follows:

 .. code-block::

-  input_ids = tokenizer.encode('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt')
-  labels = tokenizer.encode('<extra_id_0> cute dog <extra_id_1> the <extra_id_2> </s>', return_tensors='pt')
+  input_ids = tokenizer('The <extra_id_0> walks in <extra_id_1> park', return_tensors='pt').input_ids
+  labels = tokenizer('<extra_id_0> cute dog <extra_id_1> the <extra_id_2>', return_tensors='pt').input_ids
  # the forward function automatically creates the correct decoder_input_ids
-  model(input_ids=input_ids, labels=labels)
+  loss = model(input_ids=input_ids, labels=labels).loss

 - Supervised training

-  In this setup the input sequence and output sequence are standard sequence-to-sequence input output mapping.
-  In translation, for instance with the input sequence "The house is wonderful." and output sequence "Das Haus ist
+  In this setup the input sequence and output sequence form a standard sequence-to-sequence input-output mapping. In
+  translation, for instance with the input sequence "The house is wonderful." and output sequence "Das Haus ist
  wunderbar.", the sentences should be processed as follows:
-  
+
 .. code-block::

-  input_ids = tokenizer.encode('translate English to German: The house is wonderful. </s>', return_tensors='pt')
-  labels = tokenizer.encode('Das Haus ist wunderbar. </s>', return_tensors='pt')
+  input_ids = tokenizer('translate English to German: The house is wonderful.', return_tensors='pt').input_ids
+  labels = tokenizer('Das Haus ist wunderbar.', return_tensors='pt').input_ids
  # the forward function automatically creates the correct decoder_input_ids
-  model(input_ids=input_ids, labels=labels)
+  loss = model(input_ids=input_ids, labels=labels).loss


 T5Config
@@ -96,19 +107,31 @@ T5Tokenizer
        create_token_type_ids_from_sequences, prepare_seq2seq_batch, save_vocabulary


+T5TokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5TokenizerFast
+    :members:
+
+
 T5Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.T5Model
-    :members: forward
+    :members: forward, parallelize, deparallelize


 T5ForConditionalGeneration
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 .. autoclass:: transformers.T5ForConditionalGeneration
-    :members: forward
+    :members: forward, parallelize, deparallelize

+T5EncoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.T5EncoderModel
+    :members: forward, parallelize, deparallelize

 TFT5Model
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -122,3 +145,9 @@ TFT5ForConditionalGeneration

 .. autoclass:: transformers.TFT5ForConditionalGeneration
    :members: call
+
+TFT5EncoderModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFT5EncoderModel
+    :members: call
--- a/docs/source/model_doc/tapas.rst
+++ b/docs/source/model_doc/tapas.rst
@@ -0,0 +1,434 @@
+TAPAS
+-----------------------------------------------------------------------------------------------------------------------
+
+.. note::
+
+    This is a recently introduced model so the API hasn't been tested extensively. There may be some bugs or slight
+    breaking changes to fix them in the future.
+
+
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The TAPAS model was proposed in `TAPAS: Weakly Supervised Table Parsing via Pre-training
+<https://www.aclweb.org/anthology/2020.acl-main.398>`__ by Jonathan Herzig, Paweł Krzysztof Nowak, Thomas Müller,
+Francesco Piccinno and Julian Martin Eisenschlos. It's a BERT-based model specifically designed (and pre-trained) for
+answering questions about tabular data. Compared to BERT, TAPAS uses relative position embeddings and has 7 token types
+that encode tabular structure. TAPAS is pre-trained on the masked language modeling (MLM) objective on a large dataset
+comprising millions of tables from English Wikipedia and corresponding texts. For question answering, TAPAS has 2 heads
+on top: a cell selection head and an aggregation head, for (optionally) performing aggregations (such as counting or
+summing) among selected cells. TAPAS has been fine-tuned on several datasets: `SQA
+<https://www.microsoft.com/en-us/download/details.aspx?id=54253>`__ (Sequential Question Answering by Microsoft), `WTQ
+<https://github.com/ppasupat/WikiTableQuestions>`__ (Wiki Table Questions by Stanford University) and `WikiSQL
+<https://github.com/salesforce/WikiSQL>`__ (by Salesforce). It achieves state-of-the-art on both SQA and WTQ, while
+having comparable performance to SOTA on WikiSQL, with a much simpler architecture.
+
+The abstract from the paper is the following:
+
+*Answering natural language questions over tables is usually seen as a semantic parsing task. To alleviate the
+collection cost of full logical forms, one popular approach focuses on weak supervision consisting of denotations
+instead of logical forms. However, training semantic parsers from weak supervision poses difficulties, and in addition,
+the generated logical forms are only used as an intermediate step prior to retrieving the denotation. In this paper, we
+present TAPAS, an approach to question answering over tables without generating logical forms. TAPAS trains from weak
+supervision, and predicts the denotation by selecting table cells and optionally applying a corresponding aggregation
+operator to such selection. TAPAS extends BERT's architecture to encode tables as input, initializes from an effective
+joint pre-training of text segments and tables crawled from Wikipedia, and is trained end-to-end. We experiment with
+three different semantic parsing datasets, and find that TAPAS outperforms or rivals semantic parsing models by
+improving state-of-the-art accuracy on SQA from 55.1 to 67.2 and performing on par with the state-of-the-art on WIKISQL
+and WIKITQ, but with a simpler model architecture. We additionally find that transfer learning, which is trivial in our
+setting, from WIKISQL to WIKITQ, yields 48.7 accuracy, 4.2 points above the state-of-the-art.*
+
+In addition, the authors have further pre-trained TAPAS to recognize **table entailment**, by creating a balanced
+dataset of millions of automatically created training examples which are learned in an intermediate step prior to
+fine-tuning. The authors of TAPAS call this further pre-training intermediate pre-training (since TAPAS is first
+pre-trained on MLM, and then on another dataset). They found that intermediate pre-training further improves
+performance on SQA, achieving a new state-of-the-art as well as state-of-the-art on `TabFact
+<https://github.com/wenhuchen/Table-Fact-Checking>`__, a large-scale dataset with 16k Wikipedia tables for table
+entailment (a binary classification task). For more details, see their follow-up paper: `Understanding tables with
+intermediate pre-training <https://www.aclweb.org/anthology/2020.findings-emnlp.27/>`__ by Julian Martin Eisenschlos,
+Syrine Krichene and Thomas Müller.
+
+The original code can be found `here <https://github.com/google-research/tapas>`__.
+
+Tips:
+
+- TAPAS is a model that uses relative position embeddings by default (restarting the position embeddings at every cell
+  of the table). Note that this is something that was added after the publication of the original TAPAS paper.
+  According to the authors, this usually results in a slightly better performance, and allows you to encode longer
+  sequences without running out of embeddings. This is reflected in the ``reset_position_index_per_cell`` parameter of
+  :class:`~transformers.TapasConfig`, which is set to ``True`` by default. The default versions of the models available
+  in the `model hub <https://huggingface.co/models?search=tapas>`_ all use relative position embeddings. You can still
+  use the ones with absolute position embeddings by passing in an additional argument ``revision="no_reset"`` when
+  calling the ``.from_pretrained()`` method. Note that it's usually advised to pad the inputs on the right rather than
+  the left.
+- TAPAS is based on BERT, so ``TAPAS-base`` for example corresponds to a ``BERT-base`` architecture. Of course,
+  TAPAS-large will result in the best performance (the results reported in the paper are from TAPAS-large). Results of
+  the various sized models are shown on the `original Github repository <https://github.com/google-research/tapas>`_.
+- TAPAS has checkpoints fine-tuned on SQA, which are capable of answering questions related to a table in a
+  conversational set-up. This means that you can ask follow-up questions such as "what is his age?" related to the
+  previous question. Note that the forward pass of TAPAS is a bit different in case of a conversational set-up: in that
+  case, you have to feed every table-question pair one by one to the model, such that the ``prev_labels`` token type
+  ids can be overwritten by the predicted ``labels`` of the model to the previous question. See the "Usage: inference"
+  section for more info.
+- TAPAS is similar to BERT and therefore relies on the masked language modeling (MLM) objective. It is therefore
+  efficient at predicting masked tokens and at NLU in general, but is not optimal for text generation. Models trained
+  with a causal language modeling (CLM) objective are better in that regard.
+
+
+Usage: fine-tuning
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here we explain how you can fine-tune :class:`~transformers.TapasForQuestionAnswering` on your own dataset.
+
+**STEP 1: Choose one of the 3 ways in which you can use TAPAS - or experiment**
+
+Basically, there are 3 different ways in which one can fine-tune :class:`~transformers.TapasForQuestionAnswering`,
+corresponding to the different datasets on which Tapas was fine-tuned:
+
+1. SQA: if you're interested in asking follow-up questions related to a table, in a conversational set-up. For example
+   if you first ask "what's the name of the first actor?" then you can ask a follow-up question such as "how old is
+   he?". Here, questions do not involve any aggregation (all questions are cell selection questions).
+2. WTQ: if you're not interested in asking questions in a conversational set-up, but rather just asking questions
+   related to a table, which might involve aggregation, such as counting a number of rows, summing up cell values or
+   averaging cell values. You can then for example ask "what's the total number of goals Cristiano Ronaldo made in his
+   career?". This case is also called **weak supervision**, since the model itself must learn the appropriate
+   aggregation operator (SUM/COUNT/AVERAGE/NONE) given only the answer to the question as supervision.
+3. WikiSQL-supervised: this dataset is based on WikiSQL with the model being given the ground truth aggregation
+   operator during training. This is also called **strong supervision**. Here, learning the appropriate aggregation
+   operator is much easier.
+
+To summarize:
+
++------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+| **Task**                           | **Example dataset**  | **Description**                                                                                                   |
++------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+| Conversational                     | SQA                  | Conversational, only cell selection questions                                                                     |
++------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+| Weak supervision for aggregation   | WTQ                  | Questions might involve aggregation, and the model must learn this given only the answer as supervision           |
++------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+| Strong supervision for aggregation | WikiSQL-supervised   | Questions might involve aggregation, and the model must learn this given the gold aggregation operator            |
++------------------------------------+----------------------+-------------------------------------------------------------------------------------------------------------------+
+
+Initializing a model with a pre-trained base and randomly initialized classification heads from the model hub can be
+done as follows (be sure to have installed the `torch-scatter dependency <https://github.com/rusty1s/pytorch_scatter>`_
+for your environment):
+
+.. code-block::
+
+        >>> from transformers import TapasConfig, TapasForQuestionAnswering
+
+        >>> # for example, the base sized model with default SQA configuration
+        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base')
+
+        >>> # or, the base sized model with WTQ configuration
+        >>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wtq')
+        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+
+        >>> # or, the base sized model with WikiSQL configuration
+        >>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wikisql-supervised')
+        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+
+
+Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. You can also
+experiment by defining any hyperparameters you want when initializing :class:`~transformers.TapasConfig`, and then
+create a :class:`~transformers.TapasForQuestionAnswering` based on that configuration. For example, if you have a
+dataset that has both conversational questions and questions that might involve aggregation, then you can do it this
+way. Here's an example:
+
+.. code-block::
+
+        >>> from transformers import TapasConfig, TapasForQuestionAnswering
+
+        >>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
+        >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False)
+        >>> # initializing the pre-trained base sized model with our custom classification heads
+        >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+
+What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned
+checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See `here
+<https://github.com/google-research/tapas/issues/91#issuecomment-735719340>`__ for more info.
+
+For a list of all pre-trained and fine-tuned TAPAS checkpoints available in the HuggingFace model hub, see `here
+<https://huggingface.co/models?search=tapas>`__.
+
+**STEP 2: Prepare your data in the SQA format**
+
+Second, no matter what you picked above, you should prepare your dataset in the `SQA format
+<https://www.microsoft.com/en-us/download/details.aspx?id=54253>`__. This format is a TSV/CSV file with the following
+columns:
+
+- ``id``: optional, id of the table-question pair, for bookkeeping purposes.
+- ``annotator``: optional, id of the person who annotated the table-question pair, for bookkeeping purposes.
+- ``position``: integer indicating if the question is the first, second, third,... related to the table. Only required
+  in case of conversational setup (SQA). You don't need this column in case you're going for WTQ/WikiSQL-supervised.
+- ``question``: string
+- ``table_file``: string, name of a csv file containing the tabular data
+- ``answer_coordinates``: list of one or more tuples (each tuple being a cell coordinate, i.e. row, column pair that is
+  part of the answer)
+- ``answer_text``: list of one or more strings (each string being a cell value that is part of the answer)
+- ``aggregation_label``: index of the aggregation operator. Only required in case of strong supervision for aggregation
+  (the WikiSQL-supervised case)
+- ``float_answer``: the float answer to the question, if there is one (np.nan if there isn't). Only required in case of
+  weak supervision for aggregation (such as WTQ and WikiSQL)
+
+The tables themselves should be present in a folder, each table being a separate csv file. Note that the authors of the
+TAPAS algorithm used conversion scripts with some automated logic to convert the other datasets (WTQ, WikiSQL) into the
+SQA format. The author explains this `here
+<https://github.com/google-research/tapas/issues/50#issuecomment-705465960>`__. Interestingly, these conversion scripts
+are not perfect (the ``answer_coordinates`` and ``float_answer`` fields are populated based on the ``answer_text``),
+meaning that WTQ and WikiSQL results could actually be improved.
+
+**STEP 3: Convert your data into PyTorch tensors using TapasTokenizer**
+
+Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular
+data), you can then use :class:`~transformers.TapasTokenizer` to convert table-question pairs into :obj:`input_ids`,
+:obj:`attention_mask`, :obj:`token_type_ids` and so on. Again, based on which of the three cases you picked above,
+:class:`~transformers.TapasForQuestionAnswering` requires different inputs to be fine-tuned:
+
++------------------------------------+----------------------------------------------------------------------------------------------+
+| **Task**                           | **Required inputs**                                                                          |
++------------------------------------+----------------------------------------------------------------------------------------------+
+| Conversational                     | ``input_ids``, ``attention_mask``, ``token_type_ids``, ``labels``                            |
++------------------------------------+----------------------------------------------------------------------------------------------+
+| Weak supervision for aggregation   | ``input_ids``, ``attention_mask``, ``token_type_ids``, ``labels``, ``numeric_values``,       |
+|                                    | ``numeric_values_scale``, ``float_answer``                                                   |
++------------------------------------+----------------------------------------------------------------------------------------------+
+| Strong supervision for aggregation | ``input_ids``, ``attention_mask``, ``token_type_ids``, ``labels``, ``aggregation_labels``    |
++------------------------------------+----------------------------------------------------------------------------------------------+
+
+:class:`~transformers.TapasTokenizer` creates the ``labels``, ``numeric_values`` and ``numeric_values_scale`` based on
+the ``answer_coordinates`` and ``answer_text`` columns of the TSV file. The ``float_answer`` and ``aggregation_labels``
+are already in the TSV file of step 2. Here's an example:
+
+.. code-block::
+
+        >>> from transformers import TapasTokenizer
+        >>> import pandas as pd
+
+        >>> model_name = 'google/tapas-base'
+        >>> tokenizer = TapasTokenizer.from_pretrained(model_name)
+
+        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
+        >>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+        >>> answer_coordinates = [[(0, 0)], [(2, 1)], [(0, 1), (1, 1), (2, 1)]]
+        >>> answer_text = [["Brad Pitt"], ["69"], ["209"]]
+        >>> table = pd.DataFrame.from_dict(data)
+        >>> inputs = tokenizer(table=table, queries=queries, answer_coordinates=answer_coordinates, answer_text=answer_text, padding='max_length', return_tensors='pt')
+        >>> inputs
+        {'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]),
+        'numeric_values': tensor([[ ... ]]), 'numeric_values_scale': tensor([[ ... ]]), 'labels': tensor([[ ... ]])}
+
+Note that :class:`~transformers.TapasTokenizer` expects the data of the table to be **text-only**. You can use
+``.astype(str)`` on a dataframe to turn it into text-only data. Of course, this only shows how to encode a single
+training example. It is advised to create a PyTorch dataset and a corresponding dataloader:
+
+.. code-block::
+
+        >>> import torch
+        >>> import pandas as pd
+
+        >>> tsv_path = "your_path_to_the_tsv_file"
+        >>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files"
+
+        >>> class TableDataset(torch.utils.data.Dataset):
+        ...     def __init__(self, data, tokenizer):
+        ...         self.data = data
+        ...         self.tokenizer = tokenizer
+        ...
+        ...     def __getitem__(self, idx):
+        ...         item = self.data.iloc[idx]
+        ...         table = pd.read_csv(table_csv_path + item.table_file).astype(str) # be sure to make your table data text only
+        ...         encoding = self.tokenizer(table=table, 
+        ...                                   queries=item.question, 
+        ...                                   answer_coordinates=item.answer_coordinates, 
+        ...                                   answer_text=item.answer_text,
+        ...                                   truncation=True,
+        ...                                   padding="max_length",
+        ...                                   return_tensors="pt"
+        ...         )
+        ...         # remove the batch dimension which the tokenizer adds by default
+        ...         encoding = {key: val.squeeze(0) for key, val in encoding.items()}
+        ...         # add the float_answer which is also required (weak supervision for aggregation case)
+        ...         encoding["float_answer"] = torch.tensor(item.float_answer) 
+        ...         return encoding
+        ...
+        ...     def __len__(self):
+        ...         return len(self.data)
+
+        >>> data = pd.read_csv(tsv_path, sep='\t')
+        >>> train_dataset = TableDataset(data, tokenizer)
+        >>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
+
+Note that here, we encode each table-question pair independently. This is fine as long as your dataset is **not
+conversational**. In case your dataset involves conversational questions (such as in SQA), then you should first group
+together the ``queries``, ``answer_coordinates`` and ``answer_text`` per table (in the order of their ``position``
+index) and batch encode each table with its questions. This will make sure that the ``prev_labels`` token types (see
+docs of :class:`~transformers.TapasTokenizer`) are set correctly. See `this notebook
+<https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb>`__
+for more info.
+
+**STEP 4: Train (fine-tune) TapasForQuestionAnswering**
+
+You can then fine-tune :class:`~transformers.TapasForQuestionAnswering` using native PyTorch as follows (shown here for
+the weak supervision for aggregation case):
+
+.. code-block::
+
+        >>> from transformers import TapasConfig, TapasForQuestionAnswering, AdamW
+
+        >>> # this is the default WTQ configuration
+        >>> config = TapasConfig(
+        ...            num_aggregation_labels = 4,
+        ...            use_answer_as_supervision = True,
+        ...            answer_loss_cutoff = 0.664694,
+        ...            cell_selection_preference = 0.207951,
+        ...            huber_loss_delta = 0.121194,
+        ...            init_cell_selection_weights_to_zero = True,
+        ...            select_one_column = True,
+        ...            allow_empty_column_selection = False,
+        ...            temperature = 0.0352513,
+        ... )
+        >>> model = TapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+
+        >>> optimizer = AdamW(model.parameters(), lr=5e-5)
+
+        >>> for epoch in range(2):  # loop over the dataset multiple times
+        ...     for idx, batch in enumerate(train_dataloader):
+        ...         # get the inputs
+        ...         input_ids = batch["input_ids"]
+        ...         attention_mask = batch["attention_mask"]
+        ...         token_type_ids = batch["token_type_ids"]
+        ...         labels = batch["labels"]
+        ...         numeric_values = batch["numeric_values"]
+        ...         numeric_values_scale = batch["numeric_values_scale"]
+        ...         float_answer = batch["float_answer"]
+        ...
+        ...         # zero the parameter gradients
+        ...         optimizer.zero_grad()
+        ...
+        ...         # forward + backward + optimize
+        ...         outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, 
+        ...                        labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, 
+        ...                        float_answer=float_answer)
+        ...         loss = outputs.loss
+        ...         loss.backward()
+        ...         optimizer.step()
+
+Usage: inference
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+Here we explain how you can use :class:`~transformers.TapasForQuestionAnswering` for inference (i.e. making predictions
+on new data). For inference, only ``input_ids``, ``attention_mask`` and ``token_type_ids`` (which you can obtain using
+:class:`~transformers.TapasTokenizer`) have to be provided to the model to obtain the logits. Next, you can use the
+handy ``convert_logits_to_predictions`` method of :class:`~transformers.TapasTokenizer` to convert these into predicted
+coordinates and optional aggregation indices.
+
+However, note that inference is **different** depending on whether or not the setup is conversational. In a
+non-conversational set-up, inference can be done in parallel on all table-question pairs of a batch. Here's an example
+of that:
+
+.. code-block::
+
+        >>> from transformers import TapasTokenizer, TapasForQuestionAnswering
+        >>> import pandas as pd 
+
+        >>> model_name = 'google/tapas-base-finetuned-wtq'
+        >>> model = TapasForQuestionAnswering.from_pretrained(model_name)
+        >>> tokenizer = TapasTokenizer.from_pretrained(model_name)
+
+        >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
+        >>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+        >>> table = pd.DataFrame.from_dict(data)
+        >>> inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="pt") 
+        >>> outputs = model(**inputs)
+        >>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
+        ...         inputs, 
+        ...         outputs.logits.detach(), 
+        ...         outputs.logits_aggregation.detach()
+        ... )
+
+        >>> # let's print out the results:
+        >>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"}
+        >>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]
+
+        >>> answers = []
+        >>> for coordinates in predicted_answer_coordinates:
+        ...   if len(coordinates) == 1:
+        ...     # only a single cell:
+        ...     answers.append(table.iat[coordinates[0]])
+        ...   else:
+        ...     # multiple cells
+        ...     cell_values = []
+        ...     for coordinate in coordinates:
+        ...        cell_values.append(table.iat[coordinate])
+        ...     answers.append(", ".join(cell_values))
+
+        >>> display(table)
+        >>> print("")
+        >>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
+        ...   print(query)
+        ...   if predicted_agg == "NONE":
+        ...     print("Predicted answer: " + answer)
+        ...   else:
+        ...     print("Predicted answer: " + predicted_agg + " > " + answer)    
+        What is the name of the first actor?
+        Predicted answer: Brad Pitt
+        How many movies has George Clooney played in?
+        Predicted answer: COUNT > 69
+        What is the total number of movies?
+        Predicted answer: SUM > 87, 53, 69
+
+In case of a conversational set-up, each table-question pair must be provided **sequentially** to the model, such
+that the ``prev_labels`` token types can be overwritten by the predicted ``labels`` of the previous table-question
+pair. Again, more info can be found in `this notebook
+<https://github.com/NielsRogge/Transformers-Tutorials/blob/master/TAPAS/Fine_tuning_TapasForQuestionAnswering_on_SQA.ipynb>`__.
+
+
+Tapas specific outputs
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.models.tapas.modeling_tapas.TableQuestionAnsweringOutput
+    :members:
+
+
+TapasConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasConfig
+    :members:
+
+
+TapasTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasTokenizer
+    :members: __call__, convert_logits_to_predictions, save_vocabulary
+
+
+TapasModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasModel
+    :members: forward
+
+
+TapasForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasForMaskedLM
+    :members: forward
+
+
+TapasForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasForSequenceClassification
+    :members: forward
+
+
+TapasForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TapasForQuestionAnswering
+    :members: forward
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Transformer XL
 -----------------------------------------------------------------------------------------------------------------------

@@ -14,19 +26,19 @@ The abstract from the paper is the following:

 *Transformers have a potential of learning longer-term dependency, but are limited by a fixed-length context in the
 setting of language modeling. We propose a novel neural architecture Transformer-XL that enables learning dependency
-beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and
-a novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves
-the context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and
-450% longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up
-to 1,800+ times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results
-of bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on
-Penn Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably
+beyond a fixed length without disrupting temporal coherence. It consists of a segment-level recurrence mechanism and a
+novel positional encoding scheme. Our method not only enables capturing longer-term dependency, but also resolves the
+context fragmentation problem. As a result, Transformer-XL learns dependency that is 80% longer than RNNs and 450%
+longer than vanilla Transformers, achieves better performance on both short and long sequences, and is up to 1,800+
+times faster than vanilla Transformers during evaluation. Notably, we improve the state-of-the-art results of
+bpc/perplexity to 0.99 on enwiki8, 1.08 on text8, 18.3 on WikiText-103, 21.8 on One Billion Word, and 54.5 on Penn
+Treebank (without finetuning). When trained only on WikiText-103, Transformer-XL manages to generate reasonably
 coherent, novel text articles with thousands of tokens.*

 Tips:

- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right.
-  The original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
+- Transformer-XL uses relative sinusoidal positional embeddings. Padding can be done on the left or on the right. The
+  original implementation trains on SQuAD with padding on the left, therefore the padding defaults are set to left.
 - Transformer-XL is one of the few models that has no sequence length limit.

 The original code can be found `here <https://github.com/kimiyoung/transformer-xl>`__.
@@ -46,26 +58,19 @@ TransfoXLTokenizer
    :members: save_vocabulary


-TransfoXLTokenizerFast
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: transformers.TransfoXLTokenizerFast
-    :members:
-
-
 TransfoXL specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_transfo_xl.TransfoXLModelOutput
+.. autoclass:: transformers.models.transfo_xl.modeling_transfo_xl.TransfoXLModelOutput
    :members:

-.. autoclass:: transformers.modeling_transfo_xl.TransfoXLLMHeadModelOutput
+.. autoclass:: transformers.models.transfo_xl.modeling_transfo_xl.TransfoXLLMHeadModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_transfo_xl.TFTransfoXLModelOutput
+.. autoclass:: transformers.models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput
+.. autoclass:: transformers.models.transfo_xl.modeling_tf_transfo_xl.TFTransfoXLLMHeadModelOutput
    :members:


@@ -83,6 +88,13 @@ TransfoXLLMHeadModel
    :members: forward


+TransfoXLForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TransfoXLForSequenceClassification
+    :members: forward
+
+
 TFTransfoXLModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

@@ -95,3 +107,18 @@ TFTransfoXLLMHeadModel

 .. autoclass:: transformers.TFTransfoXLLMHeadModel
    :members: call
+
+
+TFTransfoXLForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTransfoXLForSequenceClassification
+    :members: call
+
+
+Internal Layers
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.AdaptiveEmbedding
+
+.. autoclass:: transformers.TFAdaptiveEmbedding
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 XLM
 -----------------------------------------------------------------------------------------------------------------------

@@ -14,21 +26,21 @@ Guillaume Lample, Alexis Conneau. It's a transformer pretrained using one of the
 The abstract from the paper is the following:

 *Recent studies have demonstrated the efficiency of generative pretraining for English natural language understanding.
-In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining.
-We propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual
+In this work, we extend this approach to multiple languages and show the effectiveness of cross-lingual pretraining. We
+propose two methods to learn cross-lingual language models (XLMs): one unsupervised that only relies on monolingual
 data, and one supervised that leverages parallel data with a new cross-lingual language model objective. We obtain
-state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI,
-our approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation,
-we obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On
-supervised machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming
-the previous best approach by more than 4 BLEU. Our code and pretrained models will be made publicly available.*
+state-of-the-art results on cross-lingual classification, unsupervised and supervised machine translation. On XNLI, our
+approach pushes the state of the art by an absolute gain of 4.9% accuracy. On unsupervised machine translation, we
+obtain 34.3 BLEU on WMT'16 German-English, improving the previous state of the art by more than 9 BLEU. On supervised
+machine translation, we obtain a new state of the art of 38.5 BLEU on WMT'16 Romanian-English, outperforming the
+previous best approach by more than 4 BLEU. Our code and pretrained models will be made publicly available.*

 Tips:

 - XLM has many different checkpoints, which were trained using different objectives: CLM, MLM or TLM. Make sure to
  select the correct objective for your task (e.g. MLM checkpoints are not suitable for generation).
- XLM has multilingual checkpoints which leverage a specific :obj:`lang` parameter. Check out the
-  :doc:`multi-lingual <../multilingual>` page for more information.
+- XLM has multilingual checkpoints which leverage a specific :obj:`lang` parameter. Check out the :doc:`multi-lingual
+  <../multilingual>` page for more information.

 The original code can be found `here <https://github.com/facebookresearch/XLM/>`__.

@@ -50,7 +62,7 @@ XLMTokenizer
 XLM specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_xlm.XLMForQuestionAnsweringOutput
+.. autoclass:: transformers.models.xlm.modeling_xlm.XLMForQuestionAnsweringOutput
    :members:


--- a/docs/source/model_doc/xlmprophetnet.rst
+++ b/docs/source/model_doc/xlmprophetnet.rst
@@ -0,0 +1,87 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
+XLM-ProphetNet
+-----------------------------------------------------------------------------------------------------------------------
+
+**DISCLAIMER:** If you see something strange, file a `GitHub Issue
+<https://github.com/huggingface/transformers/issues/new?assignees=&labels=&template=bug-report.md&title>`__ and assign
+@patrickvonplaten.
+
+
+Overview
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+The XLM-ProphetNet model was proposed in `ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training
+<https://arxiv.org/abs/2001.04063>`__ by Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei
+Zhang, Ming Zhou on 13 Jan, 2020.
+
+XLM-ProphetNet is an encoder-decoder model and can predict n-future tokens for "ngram" language modeling instead of
+just the next token. Its architecture is identical to ProphetNet, but the model was trained on the multi-lingual
+"wiki100" Wikipedia dump.
+
+The abstract from the paper is the following:
+
+*In this paper, we present a new sequence-to-sequence pretraining model called ProphetNet, which introduces a novel
+self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism. Instead of
+the optimization of one-step ahead prediction in traditional sequence-to-sequence model, the ProphetNet is optimized by
+n-step ahead prediction which predicts the next n tokens simultaneously based on previous context tokens at each time
+step. The future n-gram prediction explicitly encourages the model to plan for the future tokens and prevent
+overfitting on strong local correlations. We pre-train ProphetNet using a base scale dataset (16GB) and a large scale
+dataset (160GB) respectively. Then we conduct experiments on CNN/DailyMail, Gigaword, and SQuAD 1.1 benchmarks for
+abstractive summarization and question generation tasks. Experimental results show that ProphetNet achieves new
+state-of-the-art results on all these datasets compared to the models using the same scale pretraining corpus.*
+
+The Authors' code can be found `here <https://github.com/microsoft/ProphetNet>`__.
+
+XLMProphetNetConfig
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetConfig
+    :members:
+
+
+XLMProphetNetTokenizer
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetTokenizer
+    :members:
+
+
+XLMProphetNetModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetModel
+
+
+XLMProphetNetEncoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetEncoder
+
+
+XLMProphetNetDecoder
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetDecoder
+
+
+XLMProphetNetForConditionalGeneration
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetForConditionalGeneration
+
+
+XLMProphetNetForCausalLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMProphetNetForCausalLM
--- a/docs/source/model_doc/xlmroberta.rst
+++ b/docs/source/model_doc/xlmroberta.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 XLM-RoBERTa
 -----------------------------------------------------------------------------------------------------------------------

@@ -12,25 +24,25 @@ data.

 The abstract from the paper is the following:

-*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for
-a wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred
+*This paper shows that pretraining multilingual language models at scale leads to significant performance gains for a
+wide range of cross-lingual transfer tasks. We train a Transformer-based masked language model on one hundred
 languages, using more than two terabytes of filtered CommonCrawl data. Our model, dubbed XLM-R, significantly
-outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy
-on XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs particularly well on
-low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model.
-We also present a detailed empirical evaluation of the key factors that are required to achieve these gains,
-including the trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and
-low resource languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling
-without sacrificing per-language performance; XLM-Ris very competitive with strong monolingual models on the GLUE
-and XNLI benchmarks. We will make XLM-R code, data, and models publicly available.*
+outperforms multilingual BERT (mBERT) on a variety of cross-lingual benchmarks, including +13.8% average accuracy on
+XNLI, +12.3% average F1 score on MLQA, and +2.1% average F1 score on NER. XLM-R performs particularly well on
+low-resource languages, improving 11.8% in XNLI accuracy for Swahili and 9.2% for Urdu over the previous XLM model. We
+also present a detailed empirical evaluation of the key factors that are required to achieve these gains, including the
+trade-offs between (1) positive transfer and capacity dilution and (2) the performance of high and low resource
+languages at scale. Finally, we show, for the first time, the possibility of multilingual modeling without sacrificing
+per-language performance; XLM-R is very competitive with strong monolingual models on the GLUE and XNLI benchmarks. We
+will make XLM-R code, data, and models publicly available.*

 Tips:

 - XLM-RoBERTa is a multilingual model trained on 100 different languages. Unlike some XLM multilingual models, it does
  not require :obj:`lang` tensors to understand which language is used, and should be able to determine the correct
  language from the input ids.
- This implementation is the same as RoBERTa. Refer to the :doc:`documentation of RoBERTa <roberta>` for usage
-  examples as well as the information relative to the inputs and outputs.
+- This implementation is the same as RoBERTa. Refer to the :doc:`documentation of RoBERTa <roberta>` for usage examples
+  as well as the information relative to the inputs and outputs.

 The original code can be found `here <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`__.

@@ -50,6 +62,13 @@ XLMRobertaTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+XLMRobertaTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLMRobertaTokenizerFast
+    :members:
+
+
 XLMRobertaModel
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 XLNet
 -----------------------------------------------------------------------------------------------------------------------

@@ -16,11 +28,11 @@ The abstract from the paper is the following:
 better performance than pretraining approaches based on autoregressive language modeling. However, relying on
 corrupting the input with masks, BERT neglects dependency between the masked positions and suffers from a
 pretrain-finetune discrepancy. In light of these pros and cons, we propose XLNet, a generalized autoregressive
-pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over
-all permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive
-formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model,
-into pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by
-a large margin, including question answering, natural language inference, sentiment analysis, and document ranking.*
+pretraining method that (1) enables learning bidirectional contexts by maximizing the expected likelihood over all
+permutations of the factorization order and (2) overcomes the limitations of BERT thanks to its autoregressive
+formulation. Furthermore, XLNet integrates ideas from Transformer-XL, the state-of-the-art autoregressive model, into
+pretraining. Empirically, under comparable experiment settings, XLNet outperforms BERT on 20 tasks, often by a large
+margin, including question answering, natural language inference, sentiment analysis, and document ranking.*

 Tips:

@@ -50,46 +62,53 @@ XLNetTokenizer
        create_token_type_ids_from_sequences, save_vocabulary


+XLNetTokenizerFast
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.XLNetTokenizerFast
+    :members:
+
+
 XLNet specific outputs
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. autoclass:: transformers.modeling_xlnet.XLNetModelOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetModelOutput
    :members:

-.. autoclass:: transformers.modeling_xlnet.XLNetLMHeadModelOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetLMHeadModelOutput
    :members:

-.. autoclass:: transformers.modeling_xlnet.XLNetForSequenceClassificationOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForSequenceClassificationOutput
    :members:

-.. autoclass:: transformers.modeling_xlnet.XLNetForMultipleChoiceOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForMultipleChoiceOutput
    :members:

-.. autoclass:: transformers.modeling_xlnet.XLNetForTokenClassificationOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForTokenClassificationOutput
    :members:

-.. autoclass:: transformers.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringSimpleOutput
    :members:

-.. autoclass:: transformers.modeling_xlnet.XLNetForQuestionAnsweringOutput
+.. autoclass:: transformers.models.xlnet.modeling_xlnet.XLNetForQuestionAnsweringOutput
    :members:

-.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetModelOutput
+.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetLMHeadModelOutput
+.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetLMHeadModelOutput
    :members:

-.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput
+.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetForSequenceClassificationOutput
    :members:

-.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput
+.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetForMultipleChoiceOutput
    :members:

-.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput
+.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetForTokenClassificationOutput
    :members:

-.. autoclass:: transformers.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput
+.. autoclass:: transformers.models.xlnet.modeling_tf_xlnet.TFXLNetForQuestionAnsweringSimpleOutput
    :members:


--- a/docs/source/model_sharing.rst
+++ b/docs/source/model_sharing.rst
@@ -1,3 +1,15 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Model sharing and uploading
 =======================================================================================================================

@@ -15,42 +27,92 @@ Prepare your model for uploading

 We have seen in the :doc:`training tutorial <training>` how to fine-tune a model on a given task. You have probably
 done something similar on your task, either using the model directly in your own training loop or using the
-:class:`~.transformers.Trainer`/:class:`~.transformers.TFTrainer` class. Let's see how you can share the result on
-the `model hub <https://huggingface.co/models>`__.
+:class:`~.transformers.Trainer`/:class:`~.transformers.TFTrainer` class. Let's see how you can share the result on the
+`model hub <https://huggingface.co/models>`__.
+
+Model versioning
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Since version v3.5.0, the model hub has built-in model versioning based on git and git-lfs. It is based on the paradigm
+that one model *is* one repo.
+
+This allows:
+
+- built-in versioning
+- access control
+- scalability
+
+This is built around *revisions*, which are a way to pin a specific version of a model, using a commit hash, tag or
+branch.
+
+For instance:
+
+.. code-block::
+
+    >>> model = AutoModel.from_pretrained(
+    ...   "julien-c/EsperBERTo-small",
+    ...   revision="v2.0.1" # tag name, or branch name, or commit hash
+    ... )

 Basic steps
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-.. 
-    When #5258 is merged, we can remove the need to create the directory.
+In order to upload a model, you'll need to first create a git repo. This repo will live on the model hub, allowing
+users to clone it and you (and your organization members) to push to it.

-First, pick a directory with the name you want your model to have on the model hub (its full name will then be
-`username/awesome-name-you-picked` or `organization/awesome-name-you-picked`) and create it with either
+You can create a model repo directly from `the /new page on the website <https://huggingface.co/new>`__.

-.. code-block::
+Alternatively, you can use the ``transformers-cli``. The next steps describe that process:

-    mkdir path/to/awesome-name-you-picked
+Go to a terminal and run the following command. It should be in the virtual environment where you installed 🤗
+Transformers, since that command :obj:`transformers-cli` comes from the library.

-or in python
+.. code-block:: bash

-.. code-block::
+    transformers-cli login

-    import os
-    os.makedirs("path/to/awesome-name-you-picked")

-then you can save your model and tokenizer with:
+Once you are logged in with your model hub credentials, you can start building your repositories. To create a repo:

-.. code-block::
+.. code-block:: bash

-    model.save_pretrained("path/to/awesome-name-you-picked")
-    tokenizer.save_pretrained("path/to/awesome-name-you-picked")
+    transformers-cli repo create your-model-name

-Or, if you're using the Trainer API
+If you want to create a repo under a specific organization, you should add a ``--organization`` flag:

-.. code-block::
+.. code-block:: bash

-    trainer.save_model("path/to/awesome-name-you-picked")
-    tokenizer.save_pretrained("path/to/awesome-name-you-picked")
+    transformers-cli repo create your-model-name --organization your-org-name
+
+This creates a repo on the model hub, which can be cloned.
+
+.. code-block:: bash
+
+    # Make sure you have git-lfs installed
+    # (https://git-lfs.github.com/)
+    git lfs install
+
+    git clone https://huggingface.co/username/your-model-name
+
+When you have your local clone of your repo and lfs installed, you can then add/remove from that clone as you would
+with any other git repo.
+
+.. code-block:: bash
+
+    # Commit as usual
+    cd your-model-name
+    echo "hello" >> README.md
+    git add . && git commit -m "Update from $USER"
+
+We are intentionally not wrapping git too much, so that you can go on with the workflow you're used to and the tools
+you already know.
+
+The only learning curve you might have compared to regular git is the one for git-lfs. The documentation at
+`git-lfs.github.com <https://git-lfs.github.com/>`__ is decent, but we'll work on a tutorial with some tips and tricks
+in the coming weeks!
+
+Additionally, if you want to change multiple repos at once, the `change_config.py script
+<https://github.com/huggingface/efficient_scripts/blob/main/change_config.py>`__ can probably save you some time.

 Make your model work on all frameworks
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -60,26 +122,24 @@ Make your model work on all frameworks

 You probably have your favorite framework, but so will other users! That's why it's best to upload your model with both
 PyTorch `and` TensorFlow checkpoints to make it easier to use (if you skip this step, users will still be able to load
-your model in another framework, but it will be slower, as it will have to be converted on the fly). Don't worry, it's super easy to do (and in a future version,
-it will all be automatic). You will need to install both PyTorch and TensorFlow for this step, but you don't need to
-worry about the GPU, so it should be very easy. Check the
-`TensorFlow installation page <https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available>`__ 
-and/or the `PyTorch installation page <https://pytorch.org/get-started/locally/#start-locally>`__ to see how.
+your model in another framework, but it will be slower, as it will have to be converted on the fly). Don't worry, it's
+super easy to do (and in a future version, it might all be automatic). You will need to install both PyTorch and
+TensorFlow for this step, but you don't need to worry about the GPU, so it should be very easy. Check the `TensorFlow
+installation page <https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available>`__ and/or the `PyTorch
+installation page <https://pytorch.org/get-started/locally/#start-locally>`__ to see how.

 First check that your model class exists in the other framework, that is try to import the same model by either adding
-or removing TF. For instance, if you trained a :class:`~transformers.DistilBertForSequenceClassification`, try to
-type
+or removing TF. For instance, if you trained a :class:`~transformers.DistilBertForSequenceClassification`, try to type

 .. code-block::

-    from transformers import TFDistilBertForSequenceClassification
+    >>> from transformers import TFDistilBertForSequenceClassification

-and if you trained a :class:`~transformers.TFDistilBertForSequenceClassification`, try to
-type
+and if you trained a :class:`~transformers.TFDistilBertForSequenceClassification`, try to type

 .. code-block::

-    from transformers import DistilBertForSequenceClassification
+    >>> from transformers import DistilBertForSequenceClassification

 This will give back an error if your model does not exist in the other framework (something that should be pretty rare
 since we're aiming for full parity between the two frameworks). In this case, skip this and go to the next step.
@@ -89,20 +149,20 @@ model class:

 .. code-block::

-    tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True)
-    tf_model.save_pretrained("path/to/awesome-name-you-picked")
+    >>> tf_model = TFDistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_pt=True)
+    >>> tf_model.save_pretrained("path/to/awesome-name-you-picked")

 and if you trained your model in TensorFlow and have to create a PyTorch version, adapt the following code to your
 model class:

 .. code-block::

-    pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True)
-    pt_model.save_pretrained("path/to/awesome-name-you-picked")
+    >>> pt_model = DistilBertForSequenceClassification.from_pretrained("path/to/awesome-name-you-picked", from_tf=True)
+    >>> pt_model.save_pretrained("path/to/awesome-name-you-picked")

 That's all there is to it!

-Check the directory before uploading
+Check the directory before pushing to the model hub
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 Make sure there are no garbage files in the directory you'll upload. It should only have:
@@ -112,87 +172,71 @@ Make sure there are no garbage files in the directory you'll upload. It should o
 - a `tf_model.h5` file, which is the TensorFlow checkpoint (unless you can't have it for some reason) ;
 - a `special_tokens_map.json`, which is part of your :doc:`tokenizer <main_classes/tokenizer>` save;
 - a `tokenizer_config.json`, which is part of your :doc:`tokenizer <main_classes/tokenizer>` save;
- files named `vocab.json`, `vocab.txt`, `merges.txt`, or similar, which contain the vocabulary of your tokenizer, part of your :doc:`tokenizer <main_classes/tokenizer>` save;
+- files named `vocab.json`, `vocab.txt`, `merges.txt`, or similar, which contain the vocabulary of your tokenizer, part
+  of your :doc:`tokenizer <main_classes/tokenizer>` save;
 - maybe a `added_tokens.json`, which is part of your :doc:`tokenizer <main_classes/tokenizer>` save.

 Other files can safely be deleted.

-Upload your model with the CLI
+
+Uploading your files
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-Now go in a terminal and run the following command. It should be in the virtual enviromnent where you installed 🤗
-Transformers, since that command :obj:`transformers-cli` comes from the library.
+Once the repo is cloned, you can add the model, configuration and tokenizer files. For instance, saving the model and
+tokenizer files:

 .. code-block::

-    transformers-cli login
+    >>> model.save_pretrained("path/to/repo/clone/your-model-name")
+    >>> tokenizer.save_pretrained("path/to/repo/clone/your-model-name")

-Then log in using the same credentials as on huggingface.co. To upload your model, just type
+Or, if you're using the Trainer API:

 .. code-block::

-    transformers-cli upload path/to/awesome-name-you-picked/
+    >>> trainer.save_model("path/to/awesome-name-you-picked")
+    >>> tokenizer.save_pretrained("path/to/repo/clone/your-model-name")

-This will upload the folder containing the weights, tokenizer and configuration we prepared in the previous section.
+You can then add these files to the staging environment and verify that they have been correctly staged with the ``git
+status`` command:

-By default you will be prompted to confirm that you want these files to be uploaded. If you are uploading multiple models and need to script that process, you can add `-y` to bypass the prompt. For example:
+.. code-block:: bash

-.. code-block::
+    git add --all
+    git status

-    transformers-cli upload -y path/to/awesome-name-you-picked/
+Finally, the files should be committed:

+.. code-block:: bash

-If you want to upload a single file (a new version of your model, or the other framework checkpoint you want to add),
-just type:
+    git commit -m "First version of the your-model-name model and tokenizer."

-.. code-block::
+And pushed to the remote:

-    transformers-cli upload path/to/awesome-name-you-picked/that-file 
+.. code-block:: bash

-or
+    git push

-.. code-block::
+This will upload the folder containing the weights, tokenizer and configuration we have just prepared.

-   transformers-cli upload path/to/awesome-name-you-picked/that-file --filename awesome-name-you-picked/new_name
-
-if you want to change its filename.
-
-This uploads the model to your personal account. If you want your model to be namespaced by your organization name
-rather than your username, add the following flag to any command:
-
-.. code-block::
-
-    --organization organization_name
-
-so for instance:
-
-.. code-block::
-
-    transformers-cli upload path/to/awesome-name-you-picked/ --organization organization_name
-
-Your model will then be accessible through its identifier, which is, as we saw above,
-`username/awesome-name-you-picked` or `organization/awesome-name-you-picked`.

 Add a model card
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-To make sure everyone knows what your model can do, what its limitations and potential bias or ethetical
-considerations, please add a README.md model card to the 🤗 Transformers repo under `model_cards/`. It should then be
-placed in a subfolder with your username or organization, then another subfolder named like your model
-(`awesome-name-you-picked`). Or just click on the "Create a model card on GitHub" button on the model page, it will
-get you directly to the right location. If you need one, `here <https://github.com/huggingface/model_card>`__ is a
-model card template (meta-suggestions are welcome).
+To make sure everyone knows what your model can do, what its limitations, potential bias or ethical considerations are,
+please add a README.md model card to your model repo. You can just create it, or there's also a convenient button
+titled "Add a README.md" on your model page. A model card template can be found
+`here <https://github.com/huggingface/model_card>`__ (meta-suggestions are
+welcome).
+
+.. note::
+
+    Model cards used to live in the 🤗 Transformers repo under `model_cards/`, but for consistency and scalability we
+    migrated every model card from the repo to its corresponding huggingface.co model repo.

 If your model is fine-tuned from another model coming from the model hub (all 🤗 Transformers pretrained models do),
 don't forget to link to its model card so that people can fully trace how your model was built.

-If you have never made a pull request to the 🤗 Transformers repo, look at the
-:doc:`contributing guide <contributing>` to see the steps to follow.
-
-.. Note::
-
-    You can also send your model card in the folder you uploaded with the CLI by placing it in a `README.md` file
-    inside `path/to/awesome-name-you-picked/`.

 Using your model
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -203,20 +247,61 @@ Anyone can load it from code:

 .. code-block::

-    tokenizer = AutoTokenizer.from_pretrained("namespace/awesome-name-you-picked")
-    model = AutoModel.from_pretrained("namespace/awesome-name-you-picked")
+    >>> tokenizer = AutoTokenizer.from_pretrained("namespace/awesome-name-you-picked")
+    >>> model = AutoModel.from_pretrained("namespace/awesome-name-you-picked")

-Additional commands
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-You can list all the files you uploaded on the hub like this:
+You may specify a revision by using the ``revision`` flag in the ``from_pretrained`` method:

 .. code-block::

-    transformers-cli s3 ls
+    >>> tokenizer = AutoTokenizer.from_pretrained(
+    >>>   "julien-c/EsperBERTo-small",
+    >>>   revision="v2.0.1" # tag name, or branch name, or commit hash
+    >>> )

-You can also delete unneeded files with
+Workflow in a Colab notebook
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

-.. code-block::
+If you're in a Colab notebook (or similar) with no direct access to a terminal, here is the workflow you can use to
+upload your model. You can execute each of the commands below in a cell by adding a ! at the beginning.

-    transformers-cli s3 rm awesome-name-you-picked/filename
+First you need to install `git-lfs` in the environment used by the notebook:
+
+.. code-block:: bash
+
+    sudo apt-get install git-lfs
+
+Then you can either create a repo directly from `huggingface.co <https://huggingface.co/>`__, or use the
+:obj:`transformers-cli` to create it:
+
+
+.. code-block:: bash
+
+    transformers-cli login
+    transformers-cli repo create your-model-name
+
+Once it's created, you can clone it and configure it (replace username with your username on huggingface.co):
+
+.. code-block:: bash
+
+    git lfs install
+
+    git clone https://username:password@huggingface.co/username/your-model-name
+    # Alternatively if you have a token,
+    # you can use it instead of your password
+    git clone https://username:token@huggingface.co/username/your-model-name
+
+    cd your-model-name
+    git config --global user.email "email@example.com"
+    # Tip: using the same email as for your huggingface.co account will link your commits to your profile
+    git config --global user.name "Your name"
+
+Once you've saved your model inside, and your clone is set up with the right remote URL, you can add it and push it
+with usual git commands.
+
+.. code-block:: bash
+
+    git add .
+    git commit -m "Initial commit"
+    git push
--- a/docs/source/model_summary.rst
+++ b/docs/source/model_summary.rst
@@ -1,12 +1,24 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Summary of the models
 =======================================================================================================================

-This is a summary of the models available in 🤗 Transformers. It assumes you’re familiar with the original
-`transformer model <https://arxiv.org/abs/1706.03762>`_. For a gentle introduction check the `annotated transformer
+This is a summary of the models available in 🤗 Transformers. It assumes you’re familiar with the original `transformer
+model <https://arxiv.org/abs/1706.03762>`_. For a gentle introduction check the `annotated transformer
 <http://nlp.seas.harvard.edu/2018/04/03/attention.html>`_. Here we focus on the high-level differences between the
-models. You can check them more in detail in their respective documentation. Also checkout the
-:doc:`pretrained model page </pretrained_models>` to see the checkpoints available for each type of model and all `the
-community models <https://huggingface.co/models>`_.
+models. You can check them more in detail in their respective documentation. Also check out the :doc:`pretrained model
+page </pretrained_models>` to see the checkpoints available for each type of model and all `the community models
+<https://huggingface.co/models>`_.

 Each one of the models in the library falls into one of the following categories:

@@ -18,9 +30,9 @@ Each one of the models in the library falls into one of the following categories

 Autoregressive models are pretrained on the classic language modeling task: guess the next token having read all the
 previous ones. They correspond to the decoder of the original transformer model, and a mask is used on top of the full
-sentence so that the attention heads can only see what was before in the next, and not what’s after. Although those
-models can be fine-tuned and achieve great results on many tasks, the most natural application is text generation.
-A typical example of such models is GPT.
+sentence so that the attention heads can only see what was before in the text, and not what’s after. Although those
+models can be fine-tuned and achieve great results on many tasks, the most natural application is text generation. A
+typical example of such models is GPT.

 Autoencoding models are pretrained by corrupting the input tokens in some way and trying to reconstruct the original
 sentence. They correspond to the encoder of the original transformer model in the sense that they get access to the
@@ -30,8 +42,8 @@ sentence classification or token classification. A typical example of such model

 Note that the only difference between autoregressive models and autoencoding models is in the way the model is
 pretrained. Therefore, the same architecture can be used for both autoregressive and autoencoding models. When a given
-model has been used for both types of pretraining, we have put it in the category corresponding to the article where it was first
-introduced.
+model has been used for both types of pretraining, we have put it in the category corresponding to the article where it
+was first introduced.

 Sequence-to-sequence models use both the encoder and the decoder of the original transformer, either for translation
 tasks or by transforming other tasks to sequence-to-sequence problems. They can be fine-tuned to many tasks but their
@@ -60,8 +72,8 @@ Original GPT
       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-openai--gpt-blueviolet">
   </a>

-`Improving Language Understanding by Generative Pre-Training <https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf>`_,
-Alec Radford et al.
+`Improving Language Understanding by Generative Pre-Training
+<https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf>`_, Alec Radford et al.

 The first autoregressive model based on the transformer architecture, pretrained on the Book Corpus dataset.

@@ -80,7 +92,8 @@ GPT-2
       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-gpt2-blueviolet">
   </a>

-`Language Models are Unsupervised Multitask Learners <https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_,
+`Language Models are Unsupervised Multitask Learners
+<https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf>`_,
 Alec Radford et al.

 A bigger and better version of GPT, pretrained on WebText (web pages from outgoing links in Reddit with 3 karmas or
@@ -122,8 +135,8 @@ Transformer-XL
       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-transfo--xl-blueviolet">
   </a>

-`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_,
-Zihang Dai et al.
+`Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <https://arxiv.org/abs/1901.02860>`_, Zihang
+Dai et al.

 Same as a regular GPT model, but introduces a recurrence mechanism for two consecutive segments (similar to a regular
 RNNs with two consecutive inputs). In this context, a segment is a number of consecutive tokens (for instance 512) that
@@ -153,8 +166,7 @@ Reformer
       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-reformer-blueviolet">
   </a>

-`Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_,
-Nikita Kitaev et al .
+`Reformer: The Efficient Transformer <https://arxiv.org/abs/2001.04451>`_, Nikita Kitaev et al.

 An autoregressive transformer model with lots of tricks to reduce memory footprint and compute time. Those tricks
 include:
@@ -188,8 +200,8 @@ XLNet
       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xlnet-blueviolet">
   </a>

-`XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_,
-Zhilin Yang et al.
+`XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_, Zhilin
+Yang et al.

 XLNet is not a traditional autoregressive model but uses a training strategy that builds on that. It permutes the
 tokens in the sentence, then allows the model to use the last n tokens to predict the token n+1. Since this is all done
@@ -207,7 +219,8 @@ Autoencoding models
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

 As mentioned before, these models rely on the encoder part of the original transformer and use no mask so the model can
-look at all the tokens in the attention heads. For pretraining, targets are the original sentences and inputs are their corrupted versions.
+look at all the tokens in the attention heads. For pretraining, targets are the original sentences and inputs are their
+corrupted versions.

 BERT
 -----------------------------------------------------------------------------------------------------------------------
@@ -260,8 +273,8 @@ Same as BERT but with a few tweaks:
    sequence of tokens) so it's more logical to have H >> E. Also, the embedding matrix is large since it's V x E (V
    being the vocab size). If E < H, it has less parameters.
  * Layers are split in groups that share parameters (to save memory).
-  * Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and B
-    (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have
+  * Next sentence prediction is replaced by a sentence ordering prediction: in the inputs, we have two sentences A and
+    B (that are consecutive) and we either feed A followed by B or B followed by A. The model must predict if they have
    been swapped or not.

 The library provides a version of the model for masked language modeling, token classification, sentence
@@ -279,8 +292,7 @@ RoBERTa
       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-roberta-blueviolet">
   </a>

-`RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_,
-Yinhan Liu et al.
+`RoBERTa: A Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_, Yinhan Liu et al.

 Same as BERT with better pretraining tricks:

@@ -339,8 +351,8 @@ library provides checkpoints for all of them:
    previous section as well). One of the languages is selected for each training sample, and the model input is a
    sentence of 256 tokens, that may span over several documents in one of those languages.
  * Masked language modeling (MLM) which is like RoBERTa. One of the languages is selected for each training sample,
-    and the model input is a sentence of 256 tokens, that may span over several documents in one of those languages, with
-    dynamic masking of the tokens.
+    and the model input is a sentence of 256 tokens, that may span over several documents in one of those languages,
+    with dynamic masking of the tokens.
  * A combination of MLM and translation language modeling (TLM). This consists of concatenating a sentence in two
    different languages, with random masking. To predict one of the masked tokens, the model can use both, the
    surrounding context in language 1 and the context given by language 2.
@@ -500,8 +512,8 @@ BART
 <https://arxiv.org/abs/1910.13461>`_, Mike Lewis et al.

 Sequence-to-sequence model with an encoder and a decoder. Encoder is fed a corrupted version of the tokens, decoder is
-fed the original tokens (but has a mask to hide the future words like a regular transformers decoder). For the encoder, on the
-pretraining tasks, a composition of the following transformations are applied:
+fed the original tokens (but has a mask to hide the future words like a regular transformers decoder). A composition of
+the following transformations is applied on the pretraining tasks for the encoder:

  * mask random tokens (like in BERT)
  * delete random tokens
@@ -523,15 +535,21 @@ Pegasus
       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-pegasus-blueviolet">
   </a>

-`PEGASUS: Pre-training with Extracted Gap-sentences forAbstractive Summarization 
+`PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive Summarization
 <https://arxiv.org/pdf/1912.08777.pdf>`_, Jingqing Zhang, Yao Zhao, Mohammad Saleh and Peter J. Liu on Dec 18, 2019.

-Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization specific pre-training objective, called Gap Sentence Generation (GSG).
+Sequence-to-sequence model with the same encoder-decoder model architecture as BART. Pegasus is pre-trained jointly on
+two self-supervised objective functions: Masked Language Modeling (MLM) and a novel summarization-specific pretraining
+objective, called Gap Sentence Generation (GSG).

-  * MLM: encoder input tokens are randomely replaced by a mask tokens and have to be predicted by the encoder (like in BERT)
-  * GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, but which has a causal mask to hide the future words like a regular auto-regressive transformer decoder.
+  * MLM: encoder input tokens are randomly replaced by mask tokens and have to be predicted by the encoder (like in
+    BERT)
+  * GSG: whole encoder input sentences are replaced by a second mask token and fed to the decoder, which has a
+    causal mask to hide the future words like a regular auto-regressive transformer decoder.

-In contrast to BART, Pegasus' pretraining task is intentionally similar to summarization: important sentences are masked and are generated together as one output sequence from the remaining sentences, similar to an extractive summary.
+In contrast to BART, Pegasus' pretraining task is intentionally similar to summarization: important sentences are
+masked and are generated together as one output sequence from the remaining sentences, similar to an extractive
+summary.

 The library provides a version of this model for conditional generation, which should be used for summarization.

@@ -554,6 +572,7 @@ A framework for translation models, using the same models as BART

 The library provides a version of this model for conditional generation.

+
 T5
 -----------------------------------------------------------------------------------------------------------------------

@@ -566,25 +585,48 @@ T5
       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-t5-blueviolet">
   </a>

-`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer <https://arxiv.org/abs/1910.10683>`_,
-Colin Raffel et al.
+`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer
+<https://arxiv.org/abs/1910.10683>`_, Colin Raffel et al.

-Uses the traditional transformer model (with a slight change in the positional embeddings, which are learned at
-each layer). To be able to operate on all NLP tasks, it transforms them into text-to-text problems by using specific
+Uses the traditional transformer model (with a slight change in the positional embeddings, which are learned at each
+layer). To be able to operate on all NLP tasks, it transforms them into text-to-text problems by using specific
 prefixes: “summarize: ”, “question: ”, “translate English to German: ” and so forth.

 The pretraining includes both supervised and self-supervised training. Supervised training is conducted on downstream
 tasks provided by the GLUE and SuperGLUE benchmarks (converting them into text-to-text tasks as explained above).

-Self-supervised training uses corrupted tokens, by randomly removing 15% of the tokens and
-replacing them with individual sentinel tokens (if several consecutive tokens are marked for removal, the whole group is replaced with a single sentinel token). The input of the encoder is the corrupted sentence, the input of the decoder is the
-original sentence and the target is then the dropped out tokens delimited by their sentinel tokens.
+Self-supervised training uses corrupted tokens, by randomly removing 15% of the tokens and replacing them with
+individual sentinel tokens (if several consecutive tokens are marked for removal, the whole group is replaced with a
+single sentinel token). The input of the encoder is the corrupted sentence, the input of the decoder is the original
+sentence and the target is then the dropped out tokens delimited by their sentinel tokens.

-For instance, if we have the sentence “My dog is very cute .”, and we decide to remove the tokens: "dog", "is" and "cute", the encoder
-input becomes “My <x> very <y> .” and the target input becomes “<x> dog is <y> cute .<z>”
+For instance, if we have the sentence “My dog is very cute .”, and we decide to remove the tokens: "dog", "is" and
+"cute", the encoder input becomes “My <x> very <y> .” and the target input becomes “<x> dog is <y> cute .<z>”

 The library provides a version of this model for conditional generation.

+
+MT5
+-----------------------------------------------------------------------------------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=mt5">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-mt5-blueviolet">
+   </a>
+   <a href="model_doc/mt5.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-mt5-blueviolet">
+   </a>
+
+`mT5: A massively multilingual pre-trained text-to-text transformer <https://arxiv.org/abs/2010.11934>`_, Linting Xue
+et al.
+
+The model architecture is the same as T5. mT5's pretraining objective includes T5's self-supervised training, but not
+T5's supervised training. mT5 is trained on 101 languages.
+
+The library provides a version of this model for conditional generation.
+
+
 MBart
 -----------------------------------------------------------------------------------------------------------------------

@@ -597,18 +639,67 @@ MBart
       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-mbart-blueviolet">
   </a>

-`Multilingual Denoising Pre-training for Neural Machine Translation <https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu, Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov
-Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.
+`Multilingual Denoising Pre-training for Neural Machine Translation <https://arxiv.org/abs/2001.08210>`_ by Yinhan Liu,
+Jiatao Gu, Naman Goyal, Xian Li, Sergey Edunov, Marjan Ghazvininejad, Mike Lewis, Luke Zettlemoyer.

-The model architecture and pre-training objective is same as BART, but MBart is trained on 25 languages 
-and is intended for supervised and unsupervised machine translation. MBart is one of the first methods 
-for pre-training a complete sequence-to-sequence model by denoising full texts in multiple languages,
+The model architecture and pretraining objective are the same as BART, but MBart is trained on 25 languages and is
+intended for supervised and unsupervised machine translation. MBart is one of the first methods for pretraining a
+complete sequence-to-sequence model by denoising full texts in multiple languages.

 The library provides a version of this model for conditional generation.

-The `mbart-large-en-ro checkpoint <https://huggingface.co/facebook/mbart-large-en-ro>`_ can be used for english -> romanian translation.
+The `mbart-large-en-ro checkpoint <https://huggingface.co/facebook/mbart-large-en-ro>`_ can be used for English ->
+Romanian translation.

-The `mbart-large-cc25 <https://huggingface.co/facebook/mbart-large-cc25>`_ checkpoint can be finetuned for other translation and summarization tasks, using code in ```examples/seq2seq/``` , but is not very useful without finetuning.
+The `mbart-large-cc25 <https://huggingface.co/facebook/mbart-large-cc25>`_ checkpoint can be finetuned for other
+translation and summarization tasks, using code in ``examples/seq2seq/``, but is not very useful without finetuning.
+
+
+ProphetNet
+-----------------------------------------------------------------------------------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=prophetnet">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-prophetnet-blueviolet">
+   </a>
+   <a href="model_doc/prophetnet.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-prophetnet-blueviolet">
+   </a>
+
+`ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by
+Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.
+
+ProphetNet introduces a novel *sequence-to-sequence* pretraining objective, called *future n-gram prediction*. In
+future n-gram prediction, the model predicts the next n tokens simultaneously based on previous context tokens at each
+time step instead of just the single next token. The future n-gram prediction explicitly encourages the model
+to plan for the future tokens and prevent overfitting on strong local correlations. The model architecture is based on
+the original Transformer, but replaces the "standard" self-attention mechanism in the decoder by a main
+self-attention mechanism and a self and n-stream (predict) self-attention mechanism.
+
+The library provides a pre-trained version of this model for conditional generation and a fine-tuned version for
+summarization.
+
+XLM-ProphetNet
+-----------------------------------------------------------------------------------------------------------------------
+
+.. raw:: html
+
+   <a href="https://huggingface.co/models?filter=xprophetnet">
+       <img alt="Models" src="https://img.shields.io/badge/All_model_pages-xprophetnet-blueviolet">
+   </a>
+   <a href="model_doc/xlmprophetnet.html">
+       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-xprophetnet-blueviolet">
+   </a>
+
+`ProphetNet: Predicting Future N-gram for Sequence-to-Sequence Pre-training, <https://arxiv.org/abs/2001.04063>`__ by
+Yu Yan, Weizhen Qi, Yeyun Gong, Dayiheng Liu, Nan Duan, Jiusheng Chen, Ruofei Zhang, Ming Zhou.
+
+XLM-ProphetNet's model architecture and pretraining objective are the same as ProphetNet, but XLM-ProphetNet was
+pre-trained on the cross-lingual dataset `XGLUE <https://arxiv.org/abs/2004.01401>`__.
+
+The library provides a pre-trained version of this model for multi-lingual conditional generation and fine-tuned
+versions for headline generation and question generation, respectively.

 .. _multimodal-models:

@@ -626,8 +717,8 @@ et al.

 A transformers model used in multimodal settings, combining a text and an image to make predictions. The transformer
 model takes as inputs the embeddings of the tokenized text and the final activations of a pretrained on images resnet
-(after the pooling layer) that goes through a linear layer (to go from number of features at the end of the
-resnet to the hidden state dimension of the transformer).
+(after the pooling layer) that goes through a linear layer (to go from number of features at the end of the resnet to
+the hidden state dimension of the transformer).

 The different inputs are concatenated, and on top of the positional embeddings, a segment embedding is added to let the
 model know which part of the input vector corresponds to the text and which to the image.
@@ -635,8 +726,7 @@ model know which part of the input vector corresponds to the text and which to t
 The pretrained model only works for classification.

 ..
-    More information in this :doc:`model documentation </model_doc/mmbt.html>`.
-    TODO: write this page
+    More information in this :doc:`model documentation </model_doc/mmbt.html>`. TODO: write this page

 .. _retrieval-based-models:

@@ -658,19 +748,22 @@ DPR
       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-dpr-blueviolet">
   </a>

-`Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`_,
-Vladimir Karpukhin et al.
+`Dense Passage Retrieval for Open-Domain Question Answering <https://arxiv.org/abs/2004.04906>`_, Vladimir Karpukhin et
+al.

-Dense Passage Retrieval (DPR) - is a set of tools and models for state-of-the-art open-domain question-answering research.
+Dense Passage Retrieval (DPR) is a set of tools and models for state-of-the-art open-domain question-answering
+research.


 DPR consists in three models:

  * Question encoder: encode questions as vectors
  * Context encoder: encode contexts as vectors
-  * Reader: extract the answer of the questions inside retrieved contexts, along with a relevance score (high if the inferred span actually answers the question).
+  * Reader: extract the answer of the questions inside retrieved contexts, along with a relevance score (high if the
+    inferred span actually answers the question).

-DPR's pipeline (not implemented yet) uses a retrieval step to find the top k contexts given a certain question, and then it calls the reader with the question and the retrieved documents to get the answer.
+DPR's pipeline (not implemented yet) uses a retrieval step to find the top k contexts given a certain question, and
+then it calls the reader with the question and the retrieved documents to get the answer.

 RAG
 -----------------------------------------------------------------------------------------------------------------------
@@ -684,12 +777,14 @@ RAG
       <img alt="Doc" src="https://img.shields.io/badge/Model_documentation-rag-blueviolet">
   </a>

-`Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks <https://arxiv.org/abs/2005.11401>`_,
-Patrick Lewis, Ethan Perez, Aleksandara Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela
+`Retrieval-Augmented Generation for Knowledge-Intensive NLP Tasks <https://arxiv.org/abs/2005.11401>`_, Patrick Lewis,
+Ethan Perez, Aleksandra Piktus, Fabio Petroni, Vladimir Karpukhin, Naman Goyal, Heinrich Küttler, Mike Lewis, Wen-tau
+Yih, Tim Rocktäschel, Sebastian Riedel, Douwe Kiela

-Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and Seq2Seq models.
-RAG models retrieve docs, pass them to a seq2seq model, then marginalize to generate outputs.
-The retriever and seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing both retrieval and generation to adapt to downstream tasks.
+Retrieval-augmented generation ("RAG") models combine the powers of pretrained dense retrieval (DPR) and Seq2Seq
+models. RAG models retrieve docs, pass them to a seq2seq model, then marginalize to generate outputs. The retriever and
+seq2seq modules are initialized from pretrained models, and fine-tuned jointly, allowing both retrieval and generation
+to adapt to downstream tasks.

 The two models RAG-Token and RAG-Sequence are available for generation.

@@ -708,19 +803,19 @@ use a sparse version of the attention matrix to speed up training.
 **LSH attention**

 :ref:`Reformer <reformer>` uses LSH attention. In the softmax(QK^t), only the biggest elements (in the softmax
-dimension) of the matrix QK^t are going to give useful contributions. So for each query q in Q, we can  consider only
+dimension) of the matrix QK^t are going to give useful contributions. So for each query q in Q, we can consider only
 the keys k in K that are close to q. A hash function is used to determine if q and k are close. The attention mask is
-modified to mask the current token (except at the first position), because it will give a query and a key equal (so very
-similar to each other). Since the hash can be a bit random, several hash functions are used in practice (determined by
-a n_rounds parameter) and then are averaged together.
+modified to mask the current token (except at the first position), because it will give a query and a key that are
+equal (so very similar to each other). Since the hash can be a bit random, several hash functions are used in practice
+(determined by an n_rounds parameter) and then are averaged together.

 .. _local-attention:

 **Local attention**

-:ref:`Longformer <longformer>` uses local attention: often, the local context (e.g., what are the two tokens to the left and
-right?) is enough to take action for a given token. Also, by stacking attention layers that have a small window, the
-last layer will have a receptive field of more than just the tokens in the window, allowing them to build a
+:ref:`Longformer <longformer>` uses local attention: often, the local context (e.g., what are the two tokens to the
+left and right?) is enough to take action for a given token. Also, by stacking attention layers that have a small
+window, the last layer will have a receptive field of more than just the tokens in the window, allowing them to build a
 representation of the whole sentence.

 Some preselected input tokens are also given global attention: for those few tokens, the attention matrix can access
@@ -743,8 +838,9 @@ Other tricks

 :ref:`Reformer <reformer>` uses axial positional encodings: in traditional transformer models, the positional encoding
 E is a matrix of size :math:`l` by :math:`d`, :math:`l` being the sequence length and :math:`d` the dimension of the
-hidden state. If you have very long texts, this matrix can be huge and take way too much space on the GPU. To alleviate that, axial positional encodings consist of factorizing that big matrix E in two smaller matrices E1 and
-E2, with dimensions :math:`l_{1} \times d_{1}` and :math:`l_{2} \times d_{2}`, such that :math:`l_{1} \times l_{2} = l`
-and :math:`d_{1} + d_{2} = d` (with the product for the lengths, this ends up being way smaller). The embedding for
-time step :math:`j` in E is obtained by concatenating the embeddings for timestep :math:`j \% l1` in E1 and
-:math:`j // l1` in E2.
+hidden state. If you have very long texts, this matrix can be huge and take way too much space on the GPU. To alleviate
+that, axial positional encodings consist of factorizing that big matrix E into two smaller matrices E1 and E2, with
+dimensions :math:`l_{1} \times d_{1}` and :math:`l_{2} \times d_{2}`, such that :math:`l_{1} \times l_{2} = l` and
+:math:`d_{1} + d_{2} = d` (with the product for the lengths, this ends up being way smaller). The embedding for time
+step :math:`j` in E is obtained by concatenating the embeddings for timestep :math:`j \% l_{1}` in E1 and
+:math:`j // l_{1}` in E2.
--- a/docs/source/multilingual.rst
+++ b/docs/source/multilingual.rst
@@ -1,9 +1,21 @@
+.. 
+    Copyright 2020 The HuggingFace Team. All rights reserved.
+
+    Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
+    the License. You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
+    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
+    specific language governing permissions and limitations under the License.
+
 Multi-lingual models
 =======================================================================================================================

-Most of the models available in this library are mono-lingual models (English, Chinese and German). A few
-multi-lingual models are available and have a different mechanisms than mono-lingual models.
-This page details the usage of these models.
+Most of the models available in this library are mono-lingual models (English, Chinese and German). A few multi-lingual
+models are available and have different mechanisms than mono-lingual models. This page details the usage of these
+models.

 The two models that currently support multiple languages are BERT and XLM.

@@ -28,8 +40,8 @@ This section concerns the following checkpoints:

 These checkpoints require language embeddings that will specify the language used at inference time. These language
 embeddings are represented as a tensor that is of the same shape as the input ids passed to the model. The values in
-these tensors depend on the language used and are identifiable using the ``lang2id`` and ``id2lang`` attributes
-from the tokenizer.
+these tensors depend on the language used and are identifiable using the ``lang2id`` and ``id2lang`` attributes from
+the tokenizer.

 Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language modeling, English-French):

@@ -78,8 +90,8 @@ You can then feed it all as input to your model:
    >>> outputs = model(input_ids, langs=langs)


-The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/text-generation/run_generation.py>`__
-can generate text using the CLM checkpoints from XLM, using the language embeddings.
+The example :prefix_link:`run_generation.py <examples/text-generation/run_generation.py>` can generate text using the
+CLM checkpoints from XLM, using the language embeddings.

 XLM without Language Embeddings
 -----------------------------------------------------------------------------------------------------------------------
@@ -89,8 +101,8 @@ This section concerns the following checkpoints:
 - ``xlm-mlm-17-1280`` (Masked language modeling, 17 languages)
 - ``xlm-mlm-100-1280`` (Masked language modeling, 100 languages)

-These checkpoints do not require language embeddings at inference time. These models are used to have generic
-sentence representations, differently from previously-mentioned XLM checkpoints.
+These checkpoints do not require language embeddings at inference time. These models are used to obtain generic
+sentence representations, unlike the previously mentioned XLM checkpoints.


 BERT
@@ -101,15 +113,15 @@ BERT has two checkpoints that can be used for multi-lingual tasks:
 - ``bert-base-multilingual-uncased`` (Masked language modeling + Next sentence prediction, 102 languages)
 - ``bert-base-multilingual-cased`` (Masked language modeling + Next sentence prediction, 104 languages)

-These checkpoints do not require language embeddings at inference time. They should identify the language
-used in the context and infer accordingly.
+These checkpoints do not require language embeddings at inference time. They should identify the language used in the
+context and infer accordingly.

 XLM-RoBERTa
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

-XLM-RoBERTa was trained on 2.5TB of newly created clean CommonCrawl data in 100 languages. It provides strong
-gains over previously released multi-lingual models like mBERT or XLM on downstream taks like classification,
-sequence labeling and question answering.
+XLM-RoBERTa was trained on 2.5TB of newly created clean CommonCrawl data in 100 languages. It provides strong gains
+over previously released multi-lingual models like mBERT or XLM on downstream tasks like classification, sequence
+labeling and question answering.

 Two XLM-RoBERTa checkpoints can be used for multi-lingual tasks:

--- a/Show More
+++ b/Show More
				`@@ -0,0 +1 @@`
				`$PYTHON setup.py install # Python command to install the script.`