Merge branch 'xlnet'

2019-07-16 11:51:13 +02:00
parent 78462aad61 1b35d05d4b
commit f31154cb9d
136 changed files with 16407 additions and 10024 deletions
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -1,28 +1,30 @@
 version: 2
 jobs:
    build_py3:
-        working_directory: ~/pytorch-pretrained-BERT
+        working_directory: ~/pytorch-transformers
        docker:
            - image: circleci/python:3.5
+        resource_class: large
+        parallelism: 4  # NOTE(review): no test-splitting step is visible in this job, so each of the 4 containers presumably runs the full suite - confirm
        steps:
            - checkout
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo pip install spacy ftfy==4.4.3
-            - run: sudo python -m spacy download en
-            - run: python -m pytest -sv tests/ --cov
+            - run: sudo pip install tensorboardX scikit-learn
+            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
+            - run: python -m pytest -sv ./examples/
            - run: codecov
    build_py2:
-        working_directory: ~/pytorch-pretrained-BERT
+        working_directory: ~/pytorch-transformers
+        resource_class: large
+        parallelism: 4  # NOTE(review): no test-splitting step is visible in this job, so each of the 4 containers presumably runs the full suite - confirm
        docker:
            - image: circleci/python:2.7
        steps:
            - checkout
            - run: sudo pip install --progress-bar off .
            - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo pip install spacy ftfy==4.4.3
-            - run: sudo python -m spacy download en
-            - run: python -m pytest -sv tests/ --cov
+            - run: python -m pytest -sv ./pytorch_transformers/tests/ --cov
            - run: codecov
 workflows:
  version: 2
--- a/.coveragerc
+++ b/.coveragerc
@@ -1,5 +1,9 @@
 [run]
-source=pytorch_pretrained_bert
+source=pytorch_transformers
+omit =
+    # skip conversion scripts from testing for now
+    */convert_*
+    */__main__.py
 [report]
 exclude_lines =
    pragma: no cover
--- a/.gitignore
+++ b/.gitignore
@@ -123,3 +123,8 @@ tensorflow_code

 # Models
 models
+proc_data
+
+# examples
+runs
+examples/runs
--- a/README.md
+++ b/README.md
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -2,6 +2,6 @@ FROM pytorch/pytorch:latest

 RUN git clone https://github.com/NVIDIA/apex.git && cd apex && python setup.py install --cuda_ext --cpp_ext

-RUN pip install pytorch-pretrained-bert
+RUN pip install pytorch_transformers

 WORKDIR /workspace
--- a/docs/Makefile
+++ b/docs/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+SOURCEDIR     = source
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option.  $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
--- a/docs/README.md
+++ b/docs/README.md
@@ -0,0 +1,60 @@
+# Generating the documentation
+
+To generate the documentation, you first have to build it. Several packages are necessary to build the doc,
+you can install them using:
+
+```bash
+pip install -r requirements.txt
+```
+ 
+## Packages installed
+
+Here's an overview of all the packages installed. If you ran the previous command installing all packages from 
+`requirements.txt`, you do not need to run the following commands.
+
+Building it requires the package `sphinx` that you can 
+install using:
+
+```bash
+pip install -U sphinx
+```
+
+You would also need the custom installed [theme](https://github.com/readthedocs/sphinx_rtd_theme) by 
+[Read The Docs](https://readthedocs.org/). You can install it using the following command:
+
+```bash
+pip install sphinx_rtd_theme
+```
+
+The third necessary package is the `recommonmark` package to accept Markdown as well as reStructuredText:
+
+```bash
+pip install recommonmark
+```
+
+## Building the documentation
+
+Once you have set up `sphinx`, you can build the documentation by running the following command in the `/docs` folder:
+
+```bash
+make html
+```
+
+---
+**NOTE**
+
+If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build
+directory before rebuilding. Run the following command to clean and build:
+
+```bash
+make clean && make html
+```
+
+---
+
+The build produces the static site, which will be available under `/docs/_build/html`.
+
+## Adding a new element to the tree (toc-tree)
+
+Accepted files are reStructuredText (.rst) and Markdown (.md). Create a file with its extension and put it
+in the source directory. You can then link it to the toc-tree by putting the filename without the extension.
--- a/docs/requirements.txt
+++ b/docs/requirements.txt
@@ -0,0 +1,28 @@
+alabaster==0.7.12
+Babel==2.7.0
+certifi==2019.6.16
+chardet==3.0.4
+commonmark==0.9.0
+docutils==0.14
+future==0.17.1
+idna==2.8
+imagesize==1.1.0
+Jinja2==2.10.1
+MarkupSafe==1.1.1
+packaging==19.0
+Pygments==2.4.2
+pyparsing==2.4.0
+pytz==2019.1
+recommonmark==0.5.0
+requests==2.22.0
+six==1.12.0
+snowballstemmer==1.9.0
+Sphinx==2.1.2
+sphinx-rtd-theme==0.4.3
+sphinxcontrib-applehelp==1.0.1
+sphinxcontrib-devhelp==1.0.1
+sphinxcontrib-htmlhelp==1.0.2
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==1.0.2
+sphinxcontrib-serializinghtml==1.1.3
+urllib3==1.25.3
--- a/docs/source/_static/css/Calibre-Light.ttf
+++ b/docs/source/_static/css/Calibre-Light.ttf
--- a/docs/source/_static/css/Calibre-Medium.otf
+++ b/docs/source/_static/css/Calibre-Medium.otf
--- a/docs/source/_static/css/Calibre-Regular.otf
+++ b/docs/source/_static/css/Calibre-Regular.otf
--- a/docs/source/_static/css/Calibre-Thin.otf
+++ b/docs/source/_static/css/Calibre-Thin.otf
--- a/docs/source/_static/css/code-snippets.css
+++ b/docs/source/_static/css/code-snippets.css
@@ -0,0 +1,12 @@
+
+.highlight .c1, .highlight .sd{
+    color: #999
+}
+
+.highlight .nn, .highlight .k, .highlight .s1, .highlight .nb, .highlight .bp, .highlight .kc {
+    color: #FB8D68;
+}
+
+.highlight .kn, .highlight .nv, .highlight .s2, .highlight .ow {
+    color: #6670FF;
+}
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -0,0 +1,199 @@
+/* huggingface.css */
+
+/* The literal code blocks: inline literals rendered as <tt> or <code> inside the content area */
+.rst-content tt.literal, .rst-content code.literal {
+    color: #6670FF;
+}
+
+/* To keep the logo centered */
+.wy-side-scroll {
+    width: auto;
+    font-size: 20px;
+}
+
+/* The div that holds the Hugging Face logo */
+.HuggingFaceDiv {
+    width: 100%
+}
+
+/* The search field on top of the toc tree */
+.wy-side-nav-search{
+    background-color: #6670FF;
+}
+
+/* The toc tree */
+.wy-nav-side{
+    background-color: #6670FF;
+}
+
+/* The selected items in the toc tree */
+.wy-menu-vertical li.current{
+    background-color: #A6B0FF;
+}
+
+/* When a list item that does belong to the selected block from the toc tree is hovered */
+.wy-menu-vertical li.current a:hover{
+    background-color: #B6C0FF;
+}
+
+/* When a list item that does NOT belong to the selected block from the toc tree is hovered. */
+.wy-menu-vertical li a:hover{
+    background-color: #A7AFFB;
+}
+
+/* The text items on the toc tree */
+.wy-menu-vertical a {
+    color: #FFFFDD;
+    font-family: Calibre-Light;
+}
+.wy-menu-vertical header, .wy-menu-vertical p.caption{
+    color: white;
+    font-family: Calibre-Light;
+}
+
+/* The color inside the selected toc tree block */
+.wy-menu-vertical li.toctree-l2 a, .wy-menu-vertical li.toctree-l3 a, .wy-menu-vertical li.toctree-l4 a {
+    color: black;
+}
+
+/* Inside the depth-2 selected toc tree block */
+.wy-menu-vertical li.toctree-l2.current>a {
+    background-color: #B6C0FF
+}
+.wy-menu-vertical li.toctree-l2.current li.toctree-l3>a {
+    background-color: #C6D0FF
+}
+
+/* Inside the depth-3 selected toc tree block */
+.wy-menu-vertical li.toctree-l3.current li.toctree-l4>a{
+    background-color: #D6E0FF
+}
+
+/* Inside code snippets */
+.rst-content dl:not(.docutils) dt{
+    font-size: 15px;
+}
+
+/* Links */
+a {
+    color: #6670FF;
+}
+
+/* Content bars */
+.rst-content dl:not(.docutils) dt {
+    background-color: rgba(251, 141, 104, 0.1);
+    border-right: solid 2px #FB8D68;
+    border-left: solid 2px #FB8D68;
+    color: #FB8D68;
+    font-family: Calibre-Light;
+    border-top: none;
+    font-style: normal !important;
+}
+
+/* Expand button */
+.wy-menu-vertical li.toctree-l2 span.toctree-expand,
+.wy-menu-vertical li.on a span.toctree-expand, .wy-menu-vertical li.current>a span.toctree-expand,
+.wy-menu-vertical li.toctree-l3 span.toctree-expand{
+    color: black;
+}
+
+/* Max window size */
+.wy-nav-content{
+    max-width: 1200px;
+}
+
+/* Mobile header */
+.wy-nav-top{
+    background-color: #6670FF;
+}
+
+
+/* Source spans */
+.rst-content .viewcode-link, .rst-content .viewcode-back{
+    color: #6670FF;
+    font-size: 110%;
+    letter-spacing: 2px;
+    text-transform: uppercase;
+}
+
+/* It would be better for table to be visible without horizontal scrolling */
+.wy-table-responsive table td, .wy-table-responsive table th{
+    white-space: normal;
+}
+
+.footer {
+    margin-top: 20px;
+}
+
+.footer__Social {
+    display: flex;
+    flex-direction: row;
+}
+
+.footer__CustomImage {
+    margin: 2px 5px 0 0;
+}
+
+/* class and method names in doc */
+.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) code.descclassname{
+    font-family: Calibre;
+    font-size: 20px !important;
+}
+
+/* class name in doc*/
+.rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname{
+    margin-right: 10px;
+    font-family: Calibre-Medium;
+}
+
+/* Method and class parameters */
+.sig-param{
+    line-height: 23px;
+}
+
+/* Class introduction "class" string at beginning */
+.rst-content dl:not(.docutils) .property{
+    font-size: 18px;
+    color: black;
+}
+
+
+/* FONTS */
+body{
+    font-family: Calibre;
+    font-size: 16px;
+}
+
+h1 {
+    font-family: Calibre-Thin;
+    font-size: 70px;
+}
+
+h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
+    font-family: Calibre-Medium;
+}
+
+@font-face {
+    font-family: Calibre-Medium;
+    src: url(./Calibre-Medium.otf);
+    font-weight:400;
+}
+
+@font-face {
+    font-family: Calibre;
+    src: url(./Calibre-Regular.otf);
+    font-weight:400;
+}
+
+@font-face {
+    font-family: Calibre-Light;
+    src: url(./Calibre-Light.ttf);
+    font-weight:400;
+}
+
+@font-face {
+    font-family: Calibre-Thin;
+    src: url(./Calibre-Thin.otf);
+    font-weight:400;
+}
+
--- a/docs/source/_static/js/custom.js
+++ b/docs/source/_static/js/custom.js
@@ -0,0 +1,54 @@
+function addIcon() {
+    const huggingFaceLogo = "http://lysand.re/huggingface_logo.svg";
+    const image = document.createElement("img");
+    image.setAttribute("src", huggingFaceLogo);
+
+    const div = document.createElement("div");
+    div.appendChild(image);
+    div.style.textAlign = 'center';
+    div.style.paddingTop = '30px';
+    div.style.backgroundColor = '#6670FF';
+
+    const scrollDiv = document.getElementsByClassName("wy-side-scroll")[0];
+    scrollDiv.prepend(div);
+}
+
+function addCustomFooter() {
+    const customFooter = document.createElement("div");
+    const questionOrIssue = document.createElement("div");
+    questionOrIssue.innerHTML = "Stuck? Read our <a href='https://medium.com/huggingface'>Blog posts</a> or <a href='https://github.com/huggingface/pytorch_transformers'>Create an issue</a>";
+    customFooter.appendChild(questionOrIssue);
+    customFooter.classList.add("footer");
+
+    const social = document.createElement("div");
+    social.classList.add("footer__Social");
+
+    const imageDetails = [
+        { link: "https://huggingface.co", imageLink: "http://lysand.re/icons/website.svg" },
+        { link: "https://twitter.com/huggingface", imageLink: "http://lysand.re/icons/twitter.svg" },
+        { link: "https://github.com/huggingface", imageLink: "http://lysand.re/icons/github.svg" },
+        { link: "https://www.linkedin.com/company/huggingface/", imageLink: "http://lysand.re/icons/linkedin.svg" }
+    ];
+
+    imageDetails.forEach(imageLinks => {
+        const link = document.createElement("a");
+        const image = document.createElement("img");
+        image.src = imageLinks.imageLink;
+        link.href = imageLinks.link;
+        image.style.width = "30px";
+        image.classList.add("footer__CustomImage");
+        link.appendChild(image);
+        social.appendChild(link);
+    });
+
+    customFooter.appendChild(social);
+    document.getElementsByTagName("footer")[0].appendChild(customFooter);
+}
+
+function onLoad() {
+    addIcon();
+    addCustomFooter();
+}
+
+window.addEventListener("load", onLoad);
+
--- a/docs/source/_static/js/huggingface_logo.svg
+++ b/docs/source/_static/js/huggingface_logo.svg
@@ -0,0 +1,47 @@
+<svg width="95px" height="88px" viewBox="0 0 95 88" version="1.1" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
+    <!-- Generator: Sketch 43.2 (39069) - http://www.bohemiancoding.com/sketch -->
+    <title>icon</title>
+    <desc>Created with Sketch.</desc>
+    <defs>
+        <path d="M13,14.7890193 C22.8284801,14.7890193 26,6.02605902 26,1.5261751 C26,-0.812484109 24.4279133,-0.0763570998 21.9099482,1.17020987 C19.5830216,2.32219957 16.4482998,3.91011313 13,3.91011313 C5.82029825,3.91011313 0,-2.97370882 0,1.5261751 C0,6.02605902 3.17151989,14.7890193 13,14.7890193 Z" id="path-1"></path>
+    </defs>
+    <g id="Page-1" stroke="none" stroke-width="1" fill="none" fill-rule="evenodd">
+        <g id="icon_desktop">
+            <g id="icon">
+                <g id="icon_desktop">
+                    <g id="Group-2">
+                        <g id="Group">
+                            <path d="M93.7930402,70.08 C94.5430402,72.24 94.3630402,74.54 93.3630402,76.54 C92.6430402,78 91.6130402,79.13 90.3530402,80.14 C88.8330402,81.34 86.9430402,82.36 84.6630402,83.34 C81.9430402,84.5 78.6230402,85.59 77.1030402,85.99 C73.2130402,87 69.4730402,87.64 65.6830402,87.67 C60.2630402,87.72 55.5930402,86.44 52.2730402,83.17 C50.5530402,83.38 48.8130402,83.5 47.0630402,83.5 C45.4030402,83.5 43.7630402,83.4 42.1330402,83.2 C38.8030402,86.45 34.1530402,87.72 28.7530402,87.67 C24.9630402,87.64 21.2230402,87 17.3230402,85.99 C15.8130402,85.59 12.4930402,84.5 9.77304019,83.34 C7.49304019,82.36 5.60304019,81.34 4.09304019,80.14 C2.82304019,79.13 1.79304019,78 1.07304019,76.54 C0.0830401858,74.54 -0.106959814,72.24 0.653040186,70.08 C-0.0469598142,68.43 -0.226959814,66.54 0.323040186,64.45 C0.573040186,63.5 0.983040186,62.62 1.50304019,61.84 C1.39304019,61.43 1.30304019,61.01 1.24304019,60.55 C0.863040186,57.81 1.81304019,55.31 3.60304019,53.37 C4.48304019,52.4 5.43304019,51.73 6.42304019,51.3 C5.69304019,48.2 5.31304019,45.01 5.31304019,41.75 C5.31304019,18.69 24.0030402,0 47.0630402,0 C54.9830402,0 62.3930402,2.2 68.7130402,6.04 C69.8530402,6.74 70.9730402,7.49 72.0430402,8.29 C72.5730402,8.69 73.1030402,9.1 73.6130402,9.53 C74.1330402,9.95 74.6430402,10.39 75.1330402,10.84 C76.6130402,12.19 78.0030402,13.64 79.2730402,15.19 C79.7030402,15.7 80.1130402,16.23 80.5130402,16.77 C81.3230402,17.84 82.0730402,18.95 82.7630402,20.1 C83.8130402,21.82 84.7330402,23.62 85.5330402,25.49 C86.0630402,26.74 86.5230402,28.02 86.9330402,29.33 C87.5430402,31.29 88.0130402,33.31 88.3330402,35.39 C88.4330402,36.08 88.5230402,36.78 88.5930402,37.48 C88.7330402,38.88 88.8130402,40.3 88.8130402,41.75 C88.8130402,44.97 88.4330402,48.13 87.7230402,51.18 C88.8230402,51.61 89.8630402,52.31 90.8330402,53.37 C92.6230402,55.31 93.5730402,57.82 93.1930402,60.56 C93.1330402,61.01 93.0430402,61.43 92.9330402,61.84 C93.4530402,62.62 93.8630402,63.5 94.1130402,64.45 
C94.6630402,66.54 94.4830402,68.43 93.7930402,70.08" id="Fill-1" fill="#FFFFFF" fill-rule="nonzero"></path>
+                            <circle id="Oval" fill="#FFD21E" fill-rule="nonzero" cx="46.75" cy="41.75" r="34.75"></circle>
+                            <path d="M81.5,41.75 C81.5,22.5581049 65.9418951,7 46.75,7 C27.5581049,7 12,22.5581049 12,41.75 C12,60.9418951 27.5581049,76.5 46.75,76.5 C65.9418951,76.5 81.5,60.9418951 81.5,41.75 Z M8,41.75 C8,20.3489659 25.3489659,3 46.75,3 C68.1510341,3 85.5,20.3489659 85.5,41.75 C85.5,63.1510341 68.1510341,80.5 46.75,80.5 C25.3489659,80.5 8,63.1510341 8,41.75 Z" id="Oval" fill="#FFAC03" fill-rule="nonzero"></path>
+                            <path d="M57.1723547,31.7151181 C58.0863134,32.7107502 57.3040427,35.2620959 58.7620957,35.2620959 C61.5235194,35.2620959 63.7620957,33.0235196 63.7620957,30.2620959 C63.7620957,27.5006721 61.5235194,25.2620959 58.7620957,25.2620959 C56.0006719,25.2620959 53.7620957,27.5006721 53.7620957,30.2620959 C53.7620957,31.5654666 56.3553563,30.8251108 57.1723547,31.7151181 Z" id="Oval-2" fill="#3A3B45" fill-rule="nonzero" transform="translate(58.762096, 30.262096) rotate(-28.000000) translate(-58.762096, -30.262096) "></path>
+                            <path d="M32.1723553,31.7151181 C33.086314,32.7107502 32.3040433,35.2620959 33.7620963,35.2620959 C36.52352,35.2620959 38.7620963,33.0235196 38.7620963,30.2620959 C38.7620963,27.5006721 36.52352,25.2620959 33.7620963,25.2620959 C31.0006725,25.2620959 28.7620963,27.5006721 28.7620963,30.2620959 C28.7620963,31.5654666 31.3553569,30.8251108 32.1723553,31.7151181 Z" id="Oval-2" fill="#3A3B45" fill-rule="nonzero" transform="translate(33.762096, 30.262096) scale(-1, 1) rotate(-28.000000) translate(-33.762096, -30.262096) "></path>
+                            <g id="Oval-4" transform="translate(33.500000, 41.500000)">
+                                <g id="Mask" fill-rule="nonzero" fill="#3A3B45">
+                                    <path d="M13,14.7890193 C22.8284801,14.7890193 26,6.02605902 26,1.5261751 C26,-0.812484109 24.4279133,-0.0763570998 21.9099482,1.17020987 C19.5830216,2.32219957 16.4482998,3.91011313 13,3.91011313 C5.82029825,3.91011313 0,-2.97370882 0,1.5261751 C0,6.02605902 3.17151989,14.7890193 13,14.7890193 Z" id="path-1"></path>
+                                </g>
+                                <g id="Clipped">
+                                    <mask id="mask-2" fill="white">
+                                        <use xlink:href="#path-1"></use>
+                                    </mask>
+                                    <g id="path-1"></g>
+                                    <path d="M13.25,25 C18.0399291,25 21.9229338,21.1169953 21.9229338,16.3270662 C21.9229338,12.5962324 19.5672252,9.41560375 16.2620987,8.19147116 C16.1404592,8.14641904 16.0175337,8.10401696 15.8933923,8.06433503 C15.0599892,7.79793679 14.1717882,10.6623144 13.25,10.6623144 C12.3886883,10.6623144 11.5567012,7.77968641 10.7713426,8.01349068 C7.18916268,9.07991937 4.57706621,12.3984489 4.57706621,16.3270662 C4.57706621,21.1169953 8.46007093,25 13.25,25 Z" id="Shape" fill="#EF4E4E" fill-rule="nonzero" mask="url(#mask-2)"></path>
+                                </g>
+                            </g>
+                            <circle id="Oval-3" fill="#FFD21E" fill-rule="nonzero" style="mix-blend-mode: multiply;" cx="70.25" cy="33.75" r="3.25"></circle>
+                            <circle id="Oval-3" fill="#FFD21E" fill-rule="nonzero" style="mix-blend-mode: multiply;" cx="23.75" cy="33.75" r="3.25"></circle>
+                        </g>
+                    </g>
+                </g>
+                <g id="Group-4" transform="translate(3.000000, 48.000000)" fill-rule="nonzero">
+                    <path d="M14.0619453,0 L14.0619453,0 C12.4429453,0 10.9959453,0.665 9.98694534,1.871 C9.36294534,2.618 8.71094534,3.822 8.65794534,5.625 C7.97894534,5.43 7.32594534,5.321 6.71594534,5.321 C5.16594534,5.321 3.76594534,5.915 2.77594534,6.994 C1.50394534,8.379 0.938945345,10.081 1.18494534,11.784 C1.30194534,12.595 1.57294534,13.322 1.97794534,13.995 C1.12394534,14.686 0.494945345,15.648 0.190945345,16.805 C-0.0470546551,17.712 -0.291054655,19.601 0.982945345,21.547 C0.901945345,21.674 0.825945345,21.806 0.754945345,21.941 C-0.0110546551,23.395 -0.0600546551,25.038 0.615945345,26.568 C1.64094534,28.887 4.18794534,30.714 9.13394534,32.675 C12.2109453,33.895 15.0259453,34.675 15.0509453,34.682 C19.1189453,35.737 22.7979453,36.273 25.9829453,36.273 C31.8369453,36.273 36.0279453,34.48 38.4399453,30.944 C42.3219453,25.25 41.7669453,20.042 36.7439453,15.022 C33.9639453,12.244 32.1159453,8.148 31.7309453,7.249 C30.9549453,4.587 28.9029453,1.628 25.4919453,1.628 L25.4909453,1.628 C25.2039453,1.628 24.9139453,1.651 24.6279453,1.696 C23.1339453,1.931 21.8279453,2.791 20.8949453,4.085 C19.8879453,2.833 18.9099453,1.837 18.0249453,1.275 C16.6909453,0.429 15.3579453,0 14.0619453,0 M14.0619453,4 C14.5719453,4 15.1949453,4.217 15.8819453,4.653 C18.0149453,6.006 22.1309453,13.081 23.6379453,15.833 C24.1429453,16.755 25.0059453,17.145 25.7829453,17.145 C27.3249453,17.145 28.5289453,15.612 25.9239453,13.664 C22.0069453,10.733 23.3809453,5.942 25.2509453,5.647 C25.3329453,5.634 25.4139453,5.628 25.4919453,5.628 C27.1919453,5.628 27.9419453,8.558 27.9419453,8.558 C27.9419453,8.558 30.1399453,14.078 33.9159453,17.851 C37.6919453,21.625 37.8869453,24.654 35.1349453,28.69 C33.2579453,31.442 29.6649453,32.273 25.9829453,32.273 C22.1639453,32.273 18.2489453,31.379 16.0549453,30.81 C15.9469453,30.782 2.60394534,27.013 4.29394534,23.805 C4.57794534,23.266 5.04594534,23.05 5.63494534,23.05 C8.01494534,23.05 12.3439453,26.592 14.2049453,26.592 C14.6209453,26.592 
14.9139453,26.415 15.0339453,25.983 C15.8269453,23.138 2.97694534,21.942 4.05994534,17.821 C4.25094534,17.092 4.76894534,16.796 5.49694534,16.797 C8.64194534,16.797 15.6979453,22.328 17.1769453,22.328 C17.2899453,22.328 17.3709453,22.295 17.4149453,22.225 C18.1559453,21.029 17.7499453,20.194 12.5269453,17.033 C7.30394534,13.871 3.63794534,11.969 5.72294534,9.699 C5.96294534,9.437 6.30294534,9.321 6.71594534,9.321 C9.88694534,9.322 17.3789453,16.14 17.3789453,16.14 C17.3789453,16.14 19.4009453,18.243 20.6239453,18.243 C20.9049453,18.243 21.1439453,18.132 21.3059453,17.858 C22.1729453,16.396 13.2529453,9.636 12.7499453,6.847 C12.4089453,4.957 12.9889453,4 14.0619453,4" id="Fill-1" fill="#FFAC03"></path>
+                    <path d="M35.1348,28.6899 C37.8868,24.6539 37.6918,21.6249 33.9158,17.8509 C30.1398,14.0779 27.9418,8.5579 27.9418,8.5579 C27.9418,8.5579 27.1208,5.3519 25.2508,5.6469 C23.3808,5.9419 22.0078,10.7329 25.9248,13.6639 C29.8418,16.5939 25.1448,18.5849 23.6378,15.8329 C22.1308,13.0809 18.0158,6.0059 15.8818,4.6529 C13.7488,3.2999 12.2468,4.0579 12.7498,6.8469 C13.2528,9.6359 22.1738,16.3959 21.3058,17.8589 C20.4378,19.3209 17.3788,16.1399 17.3788,16.1399 C17.3788,16.1399 7.8068,7.4289 5.7228,9.6989 C3.6388,11.9689 7.3038,13.8709 12.5268,17.0329 C17.7508,20.1939 18.1558,21.0289 17.4148,22.2249 C16.6728,23.4209 5.1428,13.6999 4.0598,17.8209 C2.9778,21.9419 15.8268,23.1379 15.0338,25.9829 C14.2408,28.8289 5.9828,20.5979 4.2938,23.8049 C2.6038,27.0129 15.9468,30.7819 16.0548,30.8099 C20.3648,31.9279 31.3108,34.2969 35.1348,28.6899" id="Fill-4" fill="#FFD21E"></path>
+                </g>
+                <g id="Group-4" transform="translate(70.500000, 66.500000) scale(-1, 1) translate(-70.500000, -66.500000) translate(50.000000, 48.000000)" fill-rule="nonzero">
+                    <path d="M14.0619453,0 L14.0619453,0 C12.4429453,0 10.9959453,0.665 9.98694534,1.871 C9.36294534,2.618 8.71094534,3.822 8.65794534,5.625 C7.97894534,5.43 7.32594534,5.321 6.71594534,5.321 C5.16594534,5.321 3.76594534,5.915 2.77594534,6.994 C1.50394534,8.379 0.938945345,10.081 1.18494534,11.784 C1.30194534,12.595 1.57294534,13.322 1.97794534,13.995 C1.12394534,14.686 0.494945345,15.648 0.190945345,16.805 C-0.0470546551,17.712 -0.291054655,19.601 0.982945345,21.547 C0.901945345,21.674 0.825945345,21.806 0.754945345,21.941 C-0.0110546551,23.395 -0.0600546551,25.038 0.615945345,26.568 C1.64094534,28.887 4.18794534,30.714 9.13394534,32.675 C12.2109453,33.895 15.0259453,34.675 15.0509453,34.682 C19.1189453,35.737 22.7979453,36.273 25.9829453,36.273 C31.8369453,36.273 36.0279453,34.48 38.4399453,30.944 C42.3219453,25.25 41.7669453,20.042 36.7439453,15.022 C33.9639453,12.244 32.1159453,8.148 31.7309453,7.249 C30.9549453,4.587 28.9029453,1.628 25.4919453,1.628 L25.4909453,1.628 C25.2039453,1.628 24.9139453,1.651 24.6279453,1.696 C23.1339453,1.931 21.8279453,2.791 20.8949453,4.085 C19.8879453,2.833 18.9099453,1.837 18.0249453,1.275 C16.6909453,0.429 15.3579453,0 14.0619453,0 M14.0619453,4 C14.5719453,4 15.1949453,4.217 15.8819453,4.653 C18.0149453,6.006 22.1309453,13.081 23.6379453,15.833 C24.1429453,16.755 25.0059453,17.145 25.7829453,17.145 C27.3249453,17.145 28.5289453,15.612 25.9239453,13.664 C22.0069453,10.733 23.3809453,5.942 25.2509453,5.647 C25.3329453,5.634 25.4139453,5.628 25.4919453,5.628 C27.1919453,5.628 27.9419453,8.558 27.9419453,8.558 C27.9419453,8.558 30.1399453,14.078 33.9159453,17.851 C37.6919453,21.625 37.8869453,24.654 35.1349453,28.69 C33.2579453,31.442 29.6649453,32.273 25.9829453,32.273 C22.1639453,32.273 18.2489453,31.379 16.0549453,30.81 C15.9469453,30.782 2.60394534,27.013 4.29394534,23.805 C4.57794534,23.266 5.04594534,23.05 5.63494534,23.05 C8.01494534,23.05 12.3439453,26.592 14.2049453,26.592 C14.6209453,26.592 
14.9139453,26.415 15.0339453,25.983 C15.8269453,23.138 2.97694534,21.942 4.05994534,17.821 C4.25094534,17.092 4.76894534,16.796 5.49694534,16.797 C8.64194534,16.797 15.6979453,22.328 17.1769453,22.328 C17.2899453,22.328 17.3709453,22.295 17.4149453,22.225 C18.1559453,21.029 17.7499453,20.194 12.5269453,17.033 C7.30394534,13.871 3.63794534,11.969 5.72294534,9.699 C5.96294534,9.437 6.30294534,9.321 6.71594534,9.321 C9.88694534,9.322 17.3789453,16.14 17.3789453,16.14 C17.3789453,16.14 19.4009453,18.243 20.6239453,18.243 C20.9049453,18.243 21.1439453,18.132 21.3059453,17.858 C22.1729453,16.396 13.2529453,9.636 12.7499453,6.847 C12.4089453,4.957 12.9889453,4 14.0619453,4" id="Fill-1" fill="#FFAC03"></path>
+                    <path d="M35.1348,28.6899 C37.8868,24.6539 37.6918,21.6249 33.9158,17.8509 C30.1398,14.0779 27.9418,8.5579 27.9418,8.5579 C27.9418,8.5579 27.1208,5.3519 25.2508,5.6469 C23.3808,5.9419 22.0078,10.7329 25.9248,13.6639 C29.8418,16.5939 25.1448,18.5849 23.6378,15.8329 C22.1308,13.0809 18.0158,6.0059 15.8818,4.6529 C13.7488,3.2999 12.2468,4.0579 12.7498,6.8469 C13.2528,9.6359 22.1738,16.3959 21.3058,17.8589 C20.4378,19.3209 17.3788,16.1399 17.3788,16.1399 C17.3788,16.1399 7.8068,7.4289 5.7228,9.6989 C3.6388,11.9689 7.3038,13.8709 12.5268,17.0329 C17.7508,20.1939 18.1558,21.0289 17.4148,22.2249 C16.6728,23.4209 5.1428,13.6999 4.0598,17.8209 C2.9778,21.9419 15.8268,23.1379 15.0338,25.9829 C14.2408,28.8289 5.9828,20.5979 4.2938,23.8049 C2.6038,27.0129 15.9468,30.7819 16.0548,30.8099 C20.3648,31.9279 31.3108,34.2969 35.1348,28.6899" id="Fill-4" fill="#FFD21E"></path>
+                </g>
+            </g>
+        </g>
+    </g>
+</svg>
--- a/docs/source/bertology.rst
+++ b/docs/source/bertology.rst
@@ -0,0 +1,18 @@
+BERTology
+---------
+
+There is a growing field of study concerned with investigating the inner workings of large-scale transformers like BERT (that some call "BERTology"). Some good examples of this field are:
+
+
+* BERT Rediscovers the Classical NLP Pipeline by Ian Tenney, Dipanjan Das, Ellie Pavlick: https://arxiv.org/abs/1905.05950
+* Are Sixteen Heads Really Better than One? by Paul Michel, Omer Levy, Graham Neubig: https://arxiv.org/abs/1905.10650
+* What Does BERT Look At? An Analysis of BERT's Attention by Kevin Clark, Urvashi Khandelwal, Omer Levy, Christopher D. Manning: https://arxiv.org/abs/1906.04341
+
+In order to help this new field develop, we have included a few additional features in the BERT/GPT/GPT-2 models to help people access the inner representations, mainly adapted from the great work of Paul Michel (https://arxiv.org/abs/1905.10650):
+
+
+* accessing all the hidden-states of BERT/GPT/GPT-2,
+* accessing all the attention weights for each head of BERT/GPT/GPT-2,
+* retrieving heads' output values and gradients to be able to compute head importance scores and prune heads as explained in https://arxiv.org/abs/1905.10650.
+
+To help you understand and use these features, we have added a specific example script: `bertology.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/bertology.py>`_ which extracts information from and prunes a model pre-trained on MRPC.
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -0,0 +1,187 @@
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file does only contain a selection of the most common options. For a
+# full list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+
+# -- Path setup --------------------------------------------------------------
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../..'))
+
+
+# -- Project information -----------------------------------------------------
+
+project = u'pytorch-transformers'
+copyright = u'2019, huggingface'
+author = u'huggingface'
+
+# The short X.Y version
+version = u''
+# The full version, including alpha/beta/rc tags
+release = u'1.0.0'
+
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+    'sphinx.ext.coverage',
+    'sphinx.ext.napoleon',
+    'recommonmark',
+    'sphinx.ext.viewcode'
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+source_suffix = ['.rst', '.md']
+# source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = [u'_build', 'Thumbs.db', '.DS_Store']
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = None
+
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  See the documentation for
+# a list of builtin themes.
+#
+html_theme = 'sphinx_rtd_theme'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#
+html_theme_options = {
+    'analytics_id': 'UA-83738774-2'
+}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself.  Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'pytorch-transformersdoc'
+
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+    # The paper size ('letterpaper' or 'a4paper').
+    #
+    # 'papersize': 'letterpaper',
+
+    # The font size ('10pt', '11pt' or '12pt').
+    #
+    # 'pointsize': '10pt',
+
+    # Additional stuff for the LaTeX preamble.
+    #
+    # 'preamble': '',
+
+    # Latex figure (float) alignment
+    #
+    # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+#  author, documentclass [howto, manual, or own class]).
+latex_documents = [
+    (master_doc, 'pytorch-transformers.tex', u'pytorch-transformers Documentation',
+     u'huggingface', 'manual'),
+]
+
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+    (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
+     [author], 1)
+]
+
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+#  dir menu entry, description, category)
+texinfo_documents = [
+    (master_doc, 'pytorch-transformers', u'pytorch-transformers Documentation',
+     author, 'pytorch-transformers', 'One line description of project.',
+     'Miscellaneous'),
+]
+
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+def setup(app):
+    app.add_stylesheet('css/huggingface.css')
+    app.add_stylesheet('css/code-snippets.css')
+    app.add_js_file('js/custom.js')
+
+# -- Extension configuration -------------------------------------------------
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -0,0 +1,86 @@
+Converting Tensorflow Models
+================================================
+
+A command-line interface is provided to convert a TensorFlow checkpoint into a PyTorch dump of the ``BertForPreTraining`` class (for BERT), or a NumPy checkpoint into a PyTorch dump of the ``OpenAIGPTModel`` class (for OpenAI GPT).
+
+BERT
+^^^^
+
+You can convert any TensorFlow checkpoint for BERT (in particular `the pre-trained models released by Google <https://github.com/google-research/bert#pre-trained-models>`_\ ) in a PyTorch save file by using the `convert_tf_checkpoint_to_pytorch.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py>`_ script.
+
+This CLI takes as input a TensorFlow checkpoint (three files starting with ``bert_model.ckpt``\ ) and the associated configuration file (\ ``bert_config.json``\ ), and creates a PyTorch model for this configuration, loads the weights from the TensorFlow checkpoint in the PyTorch model and saves the resulting model in a standard PyTorch save file that can be imported using ``torch.load()`` (see examples in `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_extract_features.py>`_\ , `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ ).
+
+You only need to run this conversion script **once** to get a PyTorch model. You can then disregard the TensorFlow checkpoint (the three files starting with ``bert_model.ckpt``\ ) but be sure to keep the configuration file (\ ``bert_config.json``\ ) and the vocabulary file (\ ``vocab.txt``\ ) as these are needed for the PyTorch model too.
+
+To run this specific conversion script you will need to have TensorFlow and PyTorch installed (\ ``pip install tensorflow``\ ). The rest of the repository only requires PyTorch.
+
+Here is an example of the conversion process for a pre-trained ``BERT-Base Uncased`` model:
+
+.. code-block:: shell
+
+   export BERT_BASE_DIR=/path/to/bert/uncased_L-12_H-768_A-12
+
+   pytorch_transformers bert \
+     $BERT_BASE_DIR/bert_model.ckpt \
+     $BERT_BASE_DIR/bert_config.json \
+     $BERT_BASE_DIR/pytorch_model.bin
+
+You can download Google's pre-trained models for the conversion `here <https://github.com/google-research/bert#pre-trained-models>`__.
+
+OpenAI GPT
+^^^^^^^^^^
+
+Here is an example of the conversion process for a pre-trained OpenAI GPT model, assuming that your NumPy checkpoint is saved in the same format as the OpenAI pretrained model (see `here <https://github.com/openai/finetune-transformer-lm>`__\ )
+
+.. code-block:: shell
+
+   export OPENAI_GPT_CHECKPOINT_FOLDER_PATH=/path/to/openai/pretrained/numpy/weights
+
+   pytorch_transformers gpt \
+     $OPENAI_GPT_CHECKPOINT_FOLDER_PATH \
+     $PYTORCH_DUMP_OUTPUT \
+     [OPENAI_GPT_CONFIG]
+
+Transformer-XL
+^^^^^^^^^^^^^^
+
+Here is an example of the conversion process for a pre-trained Transformer-XL model (see `here <https://github.com/kimiyoung/transformer-xl/tree/master/tf#obtain-and-evaluate-pretrained-sota-models>`__\ )
+
+.. code-block:: shell
+
+   export TRANSFO_XL_CHECKPOINT_FOLDER_PATH=/path/to/transfo/xl/checkpoint
+
+   pytorch_transformers transfo_xl \
+     $TRANSFO_XL_CHECKPOINT_FOLDER_PATH \
+     $PYTORCH_DUMP_OUTPUT \
+     [TRANSFO_XL_CONFIG]
+
+GPT-2
+^^^^^
+
+Here is an example of the conversion process for a pre-trained OpenAI's GPT-2 model.
+
+.. code-block:: shell
+
+   export GPT2_DIR=/path/to/gpt2/checkpoint
+
+   pytorch_transformers gpt2 \
+     $GPT2_DIR/model.ckpt \
+     $PYTORCH_DUMP_OUTPUT \
+     [GPT2_CONFIG]
+
+XLNet
+^^^^^
+
+Here is an example of the conversion process for a pre-trained XLNet model, fine-tuned on STS-B using the TensorFlow script:
+
+.. code-block:: shell
+
+   export XLNET_CHECKPOINT_PATH=/path/to/xlnet/checkpoint
+   export XLNET_CONFIG_PATH=/path/to/xlnet/config
+
+   pytorch_transformers xlnet \
+     $XLNET_CHECKPOINT_PATH \
+     $XLNET_CONFIG_PATH \
+     $PYTORCH_DUMP_OUTPUT \
+     STS-B
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -0,0 +1,639 @@
+examples.rst
+
+Examples
+================================================
+
+.. list-table::
+   :header-rows: 1
+
+   * - Sub-section
+     - Description
+   * - `Training large models: introduction, tools and examples <#introduction>`_
+     - How to use gradient-accumulation, multi-gpu training, distributed training, optimize on CPU and 16-bits training to train Bert models
+   * - `Fine-tuning with BERT: running the examples <#fine-tuning-bert-examples>`_
+     - Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``extract_classif.py``\ , ``run_bert_classifier.py``\ , ``run_bert_squad.py`` and ``run_lm_finetuning.py``
+   * - `Fine-tuning with OpenAI GPT, Transformer-XL and GPT-2 <#fine-tuning>`_
+     - Running the examples in `examples <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples>`_\ : ``run_openai_gpt.py``\ , ``run_transfo_xl.py`` and ``run_gpt2.py``
+   * - `Fine-tuning BERT-large on GPUs <#fine-tuning-bert-large>`_
+     - How to fine tune ``BERT large``
+
+
+.. _introduction:
+
+Training large models: introduction, tools and examples
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+BERT-base and BERT-large are models with 110M and 340M parameters, respectively, and it can be difficult to fine-tune them on a single GPU with the recommended batch size for good performance (in most cases a batch size of 32).
+
+To help with fine-tuning these models, we have included several techniques that you can activate in the fine-tuning scripts `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_classifier.py>`_ and `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/run_bert_squad.py>`_\ : gradient-accumulation, multi-gpu training, distributed training and 16-bits training. For more details on how to use these techniques, you can read `the tips on training large batches in PyTorch <https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_ that I published earlier this year.
+
+Here is how to use these techniques in our scripts:
+
+
+* **Gradient Accumulation**\ : Gradient accumulation can be used by supplying an integer greater than 1 to the ``--gradient_accumulation_steps`` argument. The batch at each step will be divided by this integer and the gradient will be accumulated over ``gradient_accumulation_steps`` steps.
+* **Multi-GPU**\ : Multi-GPU is automatically activated when several GPUs are detected and the batches are split over the GPUs.
+* **Distributed training**\ : Distributed training can be activated by supplying an integer greater than or equal to 0 to the ``--local_rank`` argument (see below).
+* **16-bits training**\ : 16-bits training, also called mixed-precision training, can reduce the memory requirement of your model on the GPU by using half-precision training, basically allowing to double the batch size. If you have a recent GPU (starting from NVIDIA Volta architecture) you should see no decrease in speed. A good introduction to Mixed precision training can be found `here <https://devblogs.nvidia.com/mixed-precision-training-deep-neural-networks/>`__ and a full documentation is `here <https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html>`__. In our scripts, this option can be activated by setting the ``--fp16`` flag and you can play with loss scaling using the ``--loss_scale`` flag (see the previously linked documentation for details on loss scaling). The loss scale can be zero in which case the scale is dynamically adjusted or a positive power of two in which case the scaling is static.
+
+To use 16-bits training and distributed training, you need to install NVIDIA's apex extension `as detailed here <https://github.com/nvidia/apex>`__. You will find more information regarding the internals of ``apex`` and how to use ``apex`` in `the doc and the associated repository <https://github.com/nvidia/apex>`_. The results of the tests performed on pytorch-BERT by the NVIDIA team (and my trials at reproducing them) can be consulted in `the relevant PR of the present repository <https://github.com/huggingface/pytorch-pretrained-BERT/pull/116>`_.
+
+Note: To use *Distributed Training*\ , you will need to run one training script on each of your machines. This can be done, for example, by running the following command on each server (see `the above-mentioned blog post <https://medium.com/huggingface/training-larger-batches-practical-tips-on-1-gpu-multi-gpu-distributed-setups-ec88c3e51255>`_ for more details):
+
+.. code-block:: bash
+
+    python -m torch.distributed.launch \
+        --nproc_per_node=4 \
+        --nnodes=2 \
+        --node_rank=$THIS_MACHINE_INDEX \
+        --master_addr="192.168.1.1" \
+        --master_port=1234 run_bert_classifier.py \
+        (--arg1 --arg2 --arg3 and all other arguments of the run_classifier script)
+
+Where ``$THIS_MACHINE_INDEX`` is a sequential index assigned to each of your machines (0, 1, 2...) and the machine with rank 0 has the IP address ``192.168.1.1`` and the open port ``1234``.
+
+.. _fine-tuning-bert-examples:
+
+Fine-tuning with BERT: running the examples
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We showcase several fine-tuning examples based on (and extended from) `the original implementation <https://github.com/google-research/bert/>`_\ :
+
+
+* a *sequence-level classifier* on nine different GLUE tasks,
+* a *token-level classifier* on the question answering dataset SQuAD, and
+* a *sequence-level multiple-choice classifier* on the SWAG classification corpus.
+* a *BERT language model* on another target corpus
+
+GLUE results on dev set
+~~~~~~~~~~~~~~~~~~~~~~~
+
+We get the following results on the dev set of GLUE benchmark with an uncased BERT base
+model. All experiments were run on a P100 GPU with a batch size of 32.
+
+.. list-table::
+   :header-rows: 1
+
+   * - Task
+     - Metric
+     - Result
+   * - CoLA
+     - Matthew's corr.
+     - 57.29
+   * - SST-2
+     - accuracy
+     - 93.00
+   * - MRPC
+     - F1/accuracy
+     - 88.85/83.82
+   * - STS-B
+     - Pearson/Spearman corr.
+     - 89.70/89.37
+   * - QQP
+     - accuracy/F1
+     - 90.72/87.41
+   * - MNLI
+     - matched acc./mismatched acc.
+     - 83.95/84.39
+   * - QNLI
+     - accuracy
+     - 89.04
+   * - RTE
+     - accuracy
+     - 61.01
+   * - WNLI
+     - accuracy
+     - 53.52
+
+
+Some of these results are significantly different from the ones reported on the test set
+of GLUE benchmark on the website. For QQP and WNLI, please refer to `FAQ #12 <https://gluebenchmark.com/faq>`_ on the website.
+
+Before running any of these GLUE tasks you should download the
+`GLUE data <https://gluebenchmark.com/tasks>`_ by running
+`this script <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_
+and unpack it to some directory ``$GLUE_DIR``.
+
+.. code-block:: shell
+
+   export GLUE_DIR=/path/to/glue
+   export TASK_NAME=MRPC
+
+   python run_bert_classifier.py \
+     --task_name $TASK_NAME \
+     --do_train \
+     --do_eval \
+     --do_lower_case \
+     --data_dir $GLUE_DIR/$TASK_NAME \
+     --bert_model bert-base-uncased \
+     --max_seq_length 128 \
+     --train_batch_size 32 \
+     --learning_rate 2e-5 \
+     --num_train_epochs 3.0 \
+     --output_dir /tmp/$TASK_NAME/
+
+where task name can be one of CoLA, SST-2, MRPC, STS-B, QQP, MNLI, QNLI, RTE, WNLI.
+
+The dev set results will be present within the text file 'eval_results.txt' in the specified output_dir. In case of MNLI, since there are two separate dev sets, matched and mismatched, there will be a separate output folder called '/tmp/MNLI-MM/' in addition to '/tmp/MNLI/'.
+
+The code has not been tested with half-precision training with apex on any GLUE task apart from MRPC, MNLI, CoLA, SST-2. The following section provides details on how to run half-precision training with MRPC. With that being said, there shouldn't be any issues in running half-precision training with the remaining GLUE tasks as well, since the data processor for each task inherits from the base class DataProcessor.
+
+MRPC
+~~~~
+
+This example code fine-tunes BERT on the Microsoft Research Paraphrase
+Corpus (MRPC) and runs in less than 10 minutes on a single K-80 and in 27 seconds (!) on a single Tesla V100 16GB with apex installed.
+
+Before running this example you should download the
+`GLUE data <https://gluebenchmark.com/tasks>`_ by running
+`this script <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_
+and unpack it to some directory ``$GLUE_DIR``.
+
+.. code-block:: shell
+
+   export GLUE_DIR=/path/to/glue
+
+   python run_bert_classifier.py \
+     --task_name MRPC \
+     --do_train \
+     --do_eval \
+     --do_lower_case \
+     --data_dir $GLUE_DIR/MRPC/ \
+     --bert_model bert-base-uncased \
+     --max_seq_length 128 \
+     --train_batch_size 32 \
+     --learning_rate 2e-5 \
+     --num_train_epochs 3.0 \
+     --output_dir /tmp/mrpc_output/
+
+Our tests, run on a few seeds with `the original implementation hyper-parameters <https://github.com/google-research/bert#sentence-and-sentence-pair-classification-tasks>`__, gave evaluation results between 84% and 88%.
+
+**Fast run with apex and 16 bit precision: fine-tuning on MRPC in 27 seconds!**
+First install apex as indicated `here <https://github.com/NVIDIA/apex>`__.
+Then run
+
+.. code-block:: shell
+
+   export GLUE_DIR=/path/to/glue
+
+   python run_bert_classifier.py \
+     --task_name MRPC \
+     --do_train \
+     --do_eval \
+     --do_lower_case \
+     --data_dir $GLUE_DIR/MRPC/ \
+     --bert_model bert-base-uncased \
+     --max_seq_length 128 \
+     --train_batch_size 32 \
+     --learning_rate 2e-5 \
+     --num_train_epochs 3.0 \
+     --output_dir /tmp/mrpc_output/ \
+     --fp16
+
+**Distributed training**
+Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking model to reach an F1 > 92 on MRPC:
+
+.. code-block:: bash
+
+    python -m torch.distributed.launch \
+        --nproc_per_node 8 run_bert_classifier.py \
+        --bert_model bert-large-uncased-whole-word-masking \
+        --task_name MRPC \
+        --do_train \
+        --do_eval \
+        --do_lower_case \
+        --data_dir $GLUE_DIR/MRPC/ \
+        --max_seq_length 128 \
+        --train_batch_size 8 \
+        --learning_rate 2e-5 \
+        --num_train_epochs 3.0 \
+         --output_dir /tmp/mrpc_output/
+
+Training with these hyper-parameters gave us the following results:
+
+.. code-block:: bash
+
+     acc = 0.8823529411764706
+     acc_and_f1 = 0.901702786377709
+     eval_loss = 0.3418912578906332
+     f1 = 0.9210526315789473
+     global_step = 174
+     loss = 0.07231863956341798
+
+Here is an example on MNLI:
+
+.. code-block:: bash
+
+    python -m torch.distributed.launch \
+        --nproc_per_node 8 run_bert_classifier.py \
+        --bert_model bert-large-uncased-whole-word-masking \
+        --task_name mnli \
+        --do_train \
+        --do_eval \
+        --do_lower_case \
+        --data_dir /datadrive/bert_data/glue_data//MNLI/ \
+        --max_seq_length 128 \
+        --train_batch_size 8 \
+        --learning_rate 2e-5 \
+        --num_train_epochs 3.0 \
+        --output_dir ../models/wwm-uncased-finetuned-mnli/ \
+        --overwrite_output_dir
+
+.. code-block:: bash
+
+   ***** Eval results *****
+     acc = 0.8679706601466992
+     eval_loss = 0.4911287787382479
+     global_step = 18408
+     loss = 0.04755385363816904
+
+   ***** Eval results *****
+     acc = 0.8747965825874695
+     eval_loss = 0.45516540421714036
+     global_step = 18408
+     loss = 0.04755385363816904
+
+This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model
+
+SQuAD
+~~~~~
+
+This example code fine-tunes BERT on the SQuAD dataset. It runs in 24 min (with BERT-base) or 68 min (with BERT-large) on a single tesla V100 16GB.
+
+The data for SQuAD can be downloaded with the following links and should be saved in a ``$SQUAD_DIR`` directory.
+
+
+* `train-v1.1.json <https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json>`_
+* `dev-v1.1.json <https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json>`_
+* `evaluate-v1.1.py <https://github.com/allenai/bi-att-flow/blob/master/squad/evaluate-v1.1.py>`_
+
+.. code-block:: shell
+
+   export SQUAD_DIR=/path/to/SQUAD
+
+   python run_bert_squad.py \
+     --bert_model bert-base-uncased \
+     --do_train \
+     --do_predict \
+     --do_lower_case \
+     --train_file $SQUAD_DIR/train-v1.1.json \
+     --predict_file $SQUAD_DIR/dev-v1.1.json \
+     --train_batch_size 12 \
+     --learning_rate 3e-5 \
+     --num_train_epochs 2.0 \
+     --max_seq_length 384 \
+     --doc_stride 128 \
+     --output_dir /tmp/debug_squad/
+
+Training with the previous hyper-parameters gave us the following results:
+
+.. code-block:: bash
+
+   python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json /tmp/debug_squad/predictions.json
+   {"f1": 88.52381567990474, "exact_match": 81.22043519394512}
+
+**distributed training**
+
+Here is an example using distributed training on 8 V100 GPUs and Bert Whole Word Masking uncased model to reach an F1 > 93 on SQuAD:
+
+.. code-block:: bash
+
+   python -m torch.distributed.launch --nproc_per_node=8 \
+    run_bert_squad.py \
+    --bert_model bert-large-uncased-whole-word-masking  \
+    --do_train \
+    --do_predict \
+    --do_lower_case \
+    --train_file $SQUAD_DIR/train-v1.1.json \
+    --predict_file $SQUAD_DIR/dev-v1.1.json \
+    --learning_rate 3e-5 \
+    --num_train_epochs 2 \
+    --max_seq_length 384 \
+    --doc_stride 128 \
+    --output_dir ../models/wwm_uncased_finetuned_squad/ \
+    --train_batch_size 24 \
+    --gradient_accumulation_steps 12
+
+Training with these hyper-parameters gave us the following results:
+
+.. code-block:: bash
+
+   python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_uncased_finetuned_squad/predictions.json
+   {"exact_match": 86.91579943235573, "f1": 93.1532499015869}
+
+This is the model provided as ``bert-large-uncased-whole-word-masking-finetuned-squad``.
+
+And here is the model provided as ``bert-large-cased-whole-word-masking-finetuned-squad``\ :
+
+.. code-block:: bash
+
+    python -m torch.distributed.launch --nproc_per_node=8  run_bert_squad.py \
+        --bert_model bert-large-cased-whole-word-masking \
+        --do_train \
+        --do_predict \
+        --do_lower_case \
+        --train_file $SQUAD_DIR/train-v1.1.json \
+        --predict_file $SQUAD_DIR/dev-v1.1.json \
+        --learning_rate 3e-5 \
+        --num_train_epochs 2 \
+        --max_seq_length 384 \
+        --doc_stride 128 \
+        --output_dir ../models/wwm_cased_finetuned_squad/ \
+        --train_batch_size 24 \
+        --gradient_accumulation_steps 12
+
+Training with these hyper-parameters gave us the following results:
+
+.. code-block:: bash
+
+   python $SQUAD_DIR/evaluate-v1.1.py $SQUAD_DIR/dev-v1.1.json ../models/wwm_cased_finetuned_squad/predictions.json
+   {"exact_match": 84.18164616840113, "f1": 91.58645594850135}
+
+SWAG
+~~~~
+
+The data for SWAG can be downloaded by cloning the following `repository <https://github.com/rowanz/swagaf>`_
+
+.. code-block:: shell
+
+   export SWAG_DIR=/path/to/SWAG
+
+   python run_bert_swag.py \
+     --bert_model bert-base-uncased \
+     --do_train \
+     --do_lower_case \
+     --do_eval \
+     --data_dir $SWAG_DIR/data \
+     --train_batch_size 16 \
+     --learning_rate 2e-5 \
+     --num_train_epochs 3.0 \
+     --max_seq_length 80 \
+     --output_dir /tmp/swag_output/ \
+     --gradient_accumulation_steps 4
+
+Training with the previous hyper-parameters on a single GPU gave us the following results:
+
+.. code-block:: bash
+
+   eval_accuracy = 0.8062081375587323
+   eval_loss = 0.5966546792367169
+   global_step = 13788
+   loss = 0.06423990014260186
+
+LM Fine-tuning
+~~~~~~~~~~~~~~
+
+The data should be a text file in the same format as `sample_text.txt <./samples/sample_text.txt>`_  (one sentence per line, docs separated by empty line).
+You can download an `exemplary training corpus <https://ext-bert-sample.obs.eu-de.otc.t-systems.com/small_wiki_sentence_corpus.txt>`_ generated from wikipedia articles and split into ~500k sentences with spaCy.
+Training one epoch on this corpus takes about 1:20h on 4 x NVIDIA Tesla P100 with ``train_batch_size=200`` and ``max_seq_length=128``\ :
+
+Thanks to the work of @Rocketknight1 and @tholor there are now **several scripts** that can be used to fine-tune BERT using the pretraining objective (combination of masked-language modeling and next sentence prediction loss). These scripts are detailed in the `README <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/lm_finetuning/README.md>`_ of the `examples/lm_finetuning/ <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/examples/lm_finetuning/>`_ folder.
+
+.. _fine-tuning:
+
+OpenAI GPT, Transformer-XL and GPT-2: running the examples
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+We provide three examples of scripts for OpenAI GPT, Transformer-XL and OpenAI GPT-2 based on (and extended from) the respective original implementations:
+
+
+* fine-tuning OpenAI GPT on the ROCStories dataset
+* evaluating Transformer-XL on Wikitext 103
+* unconditional and conditional generation from a pre-trained OpenAI GPT-2 model
+
+Fine-tuning OpenAI GPT on the RocStories dataset
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This example code fine-tunes OpenAI GPT on the RocStories dataset.
+
+Before running this example you should download the
+`RocStories dataset <https://github.com/snigdhac/StoryComprehension_EMNLP/tree/master/Dataset/RoCStories>`_ and unpack it to some directory ``$ROC_STORIES_DIR``.
+
+.. code-block:: shell
+
+   export ROC_STORIES_DIR=/path/to/RocStories
+
+   python run_openai_gpt.py \
+     --model_name openai-gpt \
+     --do_train \
+     --do_eval \
+     --train_dataset $ROC_STORIES_DIR/cloze_test_val__spring2016\ -\ cloze_test_ALL_val.csv \
+     --eval_dataset $ROC_STORIES_DIR/cloze_test_test__spring2016\ -\ cloze_test_ALL_test.csv \
+     --output_dir ../log \
+     --train_batch_size 16
+
+This command runs in about 10 min on a single K-80 and gives an evaluation accuracy of about 87.7% (the authors report a median accuracy with the TensorFlow code of 85.8% and the OpenAI GPT paper reports a best single run accuracy of 86.5%).
+
+Evaluating the pre-trained Transformer-XL on the WikiText 103 dataset
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This example code evaluates the pre-trained Transformer-XL on the WikiText 103 dataset.
+This command will download a pre-processed version of the WikiText 103 dataset in which the vocabulary has been computed.
+
+.. code-block:: shell
+
+   python run_transfo_xl.py --work_dir ../log
+
+This command runs in about 1 min on a V100 and gives an evaluation perplexity of 18.22 on WikiText-103 (the authors report a perplexity of about 18.3 on this dataset with the TensorFlow code).
+
+Unconditional and conditional generation from OpenAI's GPT-2 model
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+This example code is identical to the original unconditional and conditional generation codes.
+
+Conditional generation:
+
+.. code-block:: shell
+
+   python run_gpt2.py
+
+Unconditional generation:
+
+.. code-block:: shell
+
+   python run_gpt2.py --unconditional
+
+The same options as in the original scripts are provided; please refer to the code of the example and to the original OpenAI repository.
+
+.. _fine-tuning-BERT-large:
+
+Fine-tuning BERT-large on GPUs
+------------------------------
+
+The options we list above make it possible to fine-tune BERT-large rather easily on GPU(s) instead of the TPU used by the original implementation.
+
+For example, fine-tuning BERT-large on SQuAD can be done on a server with 4 K-80 GPUs (these are pretty old now) in 18 hours. Our results are similar to the TensorFlow implementation results (actually slightly higher):
+
+.. code-block:: bash
+
+   {"exact_match": 84.56953642384106, "f1": 91.04028647786927}
+
+To get these results we used a combination of:
+
+
+* multi-GPU training (automatically activated on a multi-GPU server),
+* 2 steps of gradient accumulation and
+* perform the optimization step on CPU to store Adam's averages in RAM.
+
+Here is the full list of hyper-parameters for this run:
+
+.. code-block:: bash
+
+   export SQUAD_DIR=/path/to/SQUAD
+
+   python ./run_bert_squad.py \
+     --bert_model bert-large-uncased \
+     --do_train \
+     --do_predict \
+     --do_lower_case \
+     --train_file $SQUAD_DIR/train-v1.1.json \
+     --predict_file $SQUAD_DIR/dev-v1.1.json \
+     --learning_rate 3e-5 \
+     --num_train_epochs 2 \
+     --max_seq_length 384 \
+     --doc_stride 128 \
+     --output_dir /tmp/debug_squad/ \
+     --train_batch_size 24 \
+     --gradient_accumulation_steps 2
+
+If you have a recent GPU (starting from NVIDIA Volta series), you should try **16-bit fine-tuning** (FP16).
+
+Here is an example of hyper-parameters for a FP16 run we tried:
+
+.. code-block:: bash
+
+   export SQUAD_DIR=/path/to/SQUAD
+
+   python ./run_bert_squad.py \
+     --bert_model bert-large-uncased \
+     --do_train \
+     --do_predict \
+     --do_lower_case \
+     --train_file $SQUAD_DIR/train-v1.1.json \
+     --predict_file $SQUAD_DIR/dev-v1.1.json \
+     --learning_rate 3e-5 \
+     --num_train_epochs 2 \
+     --max_seq_length 384 \
+     --doc_stride 128 \
+     --output_dir /tmp/debug_squad/ \
+     --train_batch_size 24 \
+     --fp16 \
+     --loss_scale 128
+
+The results were similar to the above FP32 results (actually slightly higher):
+
+.. code-block:: bash
+
+   {"exact_match": 84.65468306527909, "f1": 91.238669287002}
+
+Here is an example with the recent ``bert-large-uncased-whole-word-masking``\ :
+
+.. code-block:: bash
+
+   python -m torch.distributed.launch --nproc_per_node=8 \
+     run_bert_squad.py \
+     --bert_model bert-large-uncased-whole-word-masking \
+     --do_train \
+     --do_predict \
+     --do_lower_case \
+     --train_file $SQUAD_DIR/train-v1.1.json \
+     --predict_file $SQUAD_DIR/dev-v1.1.json \
+     --learning_rate 3e-5 \
+     --num_train_epochs 2 \
+     --max_seq_length 384 \
+     --doc_stride 128 \
+     --output_dir /tmp/debug_squad/ \
+     --train_batch_size 24 \
+     --gradient_accumulation_steps 2
+
+Fine-tuning XLNet
+-----------------
+
+STS-B
+~~~~~
+
+This example code fine-tunes XLNet on the STS-B corpus.
+
+Before running this example you should download the
+`GLUE data <https://gluebenchmark.com/tasks>`_ by running
+`this script <https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e>`_
+and unpack it to some directory ``$GLUE_DIR``.
+
+.. code-block:: shell
+
+   export GLUE_DIR=/path/to/glue
+
+   python run_xlnet_classifier.py \
+    --task_name STS-B \
+    --do_train \
+    --do_eval \
+    --data_dir $GLUE_DIR/STS-B/ \
+    --max_seq_length 128 \
+    --train_batch_size 8 \
+    --gradient_accumulation_steps 1 \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir /tmp/mrpc_output/
+
+Our tests, run on a few seeds with `the original implementation hyper-parameters <https://github.com/zihangdai/xlnet#1-sts-b-sentence-pair-relevance-regression-with-gpus>`__, gave evaluation results between 84% and 88%.
+
+**Distributed training**
+Here is an example using distributed training on 8 V100 GPUs to reach XXXX:
+
+.. code-block:: bash
+
+   python -m torch.distributed.launch --nproc_per_node 8 \
+    run_xlnet_classifier.py \
+    --task_name STS-B \
+    --do_train \
+    --do_eval \
+    --data_dir $GLUE_DIR/STS-B/ \
+    --max_seq_length 128 \
+    --train_batch_size 8 \
+    --gradient_accumulation_steps 1 \
+    --learning_rate 5e-5 \
+    --num_train_epochs 3.0 \
+    --output_dir /tmp/mrpc_output/
+
+Training with these hyper-parameters gave us the following results:
+
+.. code-block:: bash
+
+     acc = 0.8823529411764706
+     acc_and_f1 = 0.901702786377709
+     eval_loss = 0.3418912578906332
+     f1 = 0.9210526315789473
+     global_step = 174
+     loss = 0.07231863956341798
+
+Here is an example on MNLI:
+
+.. code-block:: bash
+
+    python -m torch.distributed.launch --nproc_per_node 8 run_bert_classifier.py \
+        --bert_model bert-large-uncased-whole-word-masking \
+        --task_name mnli \
+        --do_train \
+        --do_eval \
+        --data_dir /datadrive/bert_data/glue_data//MNLI/ \
+        --max_seq_length 128 \
+        --train_batch_size 8 \
+        --learning_rate 2e-5 \
+        --num_train_epochs 3.0 \
+        --output_dir ../models/wwm-uncased-finetuned-mnli/ \
+        --overwrite_output_dir
+
+.. code-block:: bash
+
+   ***** Eval results *****
+     acc = 0.8679706601466992
+     eval_loss = 0.4911287787382479
+     global_step = 18408
+     loss = 0.04755385363816904
+
+   ***** Eval results *****
+     acc = 0.8747965825874695
+     eval_loss = 0.45516540421714036
+     global_step = 18408
+     loss = 0.04755385363816904
+
+This is the example of the ``bert-large-uncased-whole-word-masking-finetuned-mnli`` model.
--- a/docs/source/imgs/warmup_constant_schedule.png
+++ b/docs/source/imgs/warmup_constant_schedule.png
--- a/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png
+++ b/docs/source/imgs/warmup_cosine_hard_restarts_schedule.png
--- a/docs/source/imgs/warmup_cosine_schedule.png
+++ b/docs/source/imgs/warmup_cosine_schedule.png
--- a/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png
+++ b/docs/source/imgs/warmup_cosine_warm_restarts_schedule.png
--- a/docs/source/imgs/warmup_linear_schedule.png
+++ b/docs/source/imgs/warmup_linear_schedule.png
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -0,0 +1,290 @@
+Pytorch-Transformers
+================================================================================================================================================
+
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Notes
+
+    installation
+    philosophy
+    usage
+    examples
+    notebooks
+    converting_tensorflow_models
+    migration
+    bertology
+    torchscript
+
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Package Reference
+
+    model_doc/overview
+    model_doc/bert
+    model_doc/gpt
+    model_doc/transformerxl
+    model_doc/gpt2
+    model_doc/xlm
+    model_doc/xlnet
+
+
+.. image:: https://circleci.com/gh/huggingface/pytorch-pretrained-BERT.svg?style=svg
+   :target: https://circleci.com/gh/huggingface/pytorch-pretrained-BERT
+   :alt: CircleCI
+
+
+This repository contains op-for-op PyTorch reimplementations, pre-trained models and fine-tuning examples for:
+
+
+* `Google's BERT model <https://github.com/google-research/bert>`__\ ,
+* `OpenAI's GPT model <https://github.com/openai/finetune-transformer-lm>`__\ ,
+* `Google/CMU's Transformer-XL model <https://github.com/kimiyoung/transformer-xl>`__\ , and
+* `OpenAI's GPT-2 model <https://blog.openai.com/better-language-models/>`__.
+
+These implementations have been tested on several datasets (see the examples) and should match the performances of the associated TensorFlow implementations (e.g. ~91 F1 on SQuAD for BERT, ~88 F1 on RocStories for OpenAI GPT and ~18.3 perplexity on WikiText 103 for the Transformer-XL). You can find more details in the `Examples <./examples.html>`__ section.
+
+Here is some information on these models:
+
+**BERT** was released together with the paper `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`__ by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
+This PyTorch implementation of BERT is provided with `Google's pre-trained models <https://github.com/google-research/bert>`__\ , examples, notebooks and a command-line interface to load any pre-trained TensorFlow checkpoint for BERT.
+
+**OpenAI GPT** was released together with the paper `Improving Language Understanding by Generative Pre-Training <https://blog.openai.com/language-unsupervised/>`__ by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+This PyTorch implementation of OpenAI GPT is an adaptation of the `PyTorch implementation by HuggingFace <https://github.com/huggingface/pytorch-openai-transformer-lm>`__ and is provided with `OpenAI's pre-trained model <https://github.com/openai/finetune-transformer-lm>`__ and a command-line interface that was used to convert the pre-trained NumPy checkpoint in PyTorch.
+
+**Google/CMU's Transformer-XL** was released together with the paper `Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context <http://arxiv.org/abs/1901.02860>`__ by Zihang Dai\*, Zhilin Yang\* , Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
+This PyTorch implementation of Transformer-XL is an adaptation of the original `PyTorch implementation <https://github.com/kimiyoung/transformer-xl>`__ which has been slightly modified to match the performances of the TensorFlow implementation and allow re-using the pretrained weights. A command-line interface is provided to convert TensorFlow checkpoints to PyTorch models.
+
+**OpenAI GPT-2** was released together with the paper `Language Models are Unsupervised Multitask Learners <https://blog.openai.com/better-language-models/>`__ by Alec Radford\*, Jeffrey Wu\* , Rewon Child, David Luan, Dario Amodei\*\* and Ilya Sutskever\*\*.
+This PyTorch implementation of OpenAI GPT-2 is an adaptation of the `OpenAI's implementation <https://github.com/openai/gpt-2>`__ and is provided with `OpenAI's pre-trained model <https://github.com/openai/gpt-2>`__ and a command-line interface that was used to convert the TensorFlow checkpoint in PyTorch.
+
+**Facebook Research's XLM** was released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`__ by Guillaume Lample and Alexis Conneau.
+This PyTorch implementation of XLM is an adaptation of the original `PyTorch implementation <https://github.com/facebookresearch/XLM>`__.
+
+**Google's XLNet** was released together with the paper `XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`__ by Zhilin Yang\*, Zihang Dai\*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov and Quoc V. Le.
+This PyTorch implementation of XLNet is an adaptation of the `TensorFlow implementation <https://github.com/zihangdai/xlnet>`__.
+
+
+Content
+-------
+
+.. list-table::
+   :header-rows: 1
+
+   * - Section
+     - Description
+   * - `Installation <./installation.html>`__
+     - How to install the package
+   * - `Philosophy <./philosophy.html>`__
+     - The philosophy behind this package
+   * - `Usage <./usage.html>`__
+     - Quickstart examples
+   * - `Examples <./examples.html>`__
+     - Detailed examples on how to fine-tune Bert
+   * - `Notebooks <./notebooks.html>`__
+     - Introduction on the provided Jupyter Notebooks
+   * - `TPU <./tpu.html>`__
+     - Notes on TPU support and pretraining scripts
+   * - `Command-line interface <./cli.html>`__
+     - Convert a TensorFlow checkpoint in a PyTorch dump
+   * - `Migration <./migration.html>`__
+     - Migrating from ``pytorch_pretrained_BERT`` (v0.6) to ``pytorch_transformers`` (v1.0)
+   * - `Bertology <./bertology.html>`__
+     - Exploring the internals of the pretrained models.
+   * - `TorchScript <./torchscript.html>`__
+     - Convert a model to TorchScript for use in other programming languages
+
+.. list-table::
+   :header-rows: 1
+
+   * - Section
+     - Description
+   * - `Overview <./model_doc/overview.html>`__
+     - Overview of the package
+   * - `BERT <./model_doc/bert.html>`__
+     - BERT Models, Tokenizers and optimizers
+   * - `OpenAI GPT <./model_doc/gpt.html>`__
+     - GPT Models, Tokenizers and optimizers
+   * - `TransformerXL <./model_doc/transformerxl.html>`__
+     - TransformerXL Models, Tokenizers and optimizers
+   * - `OpenAI GPT2 <./model_doc/gpt2.html>`__
+     - GPT2 Models, Tokenizers and optimizers
+   * - `XLM <./model_doc/xlm.html>`__
+     - XLM Models, Tokenizers and optimizers
+   * - `XLNet <./model_doc/xlnet.html>`__
+     - XLNet Models, Tokenizers and optimizers
+
+Overview
+--------
+
+This package comprises the following classes that can be imported in Python and are detailed in the `documentation <./model_doc/overview.html>`__ section of this package:
+
+
+*
+  Eight **Bert** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_bert.py <./_modules/pytorch_transformers/modeling_bert.html>`__ file):
+
+
+  * `BertModel <./model_doc/bert.html#pytorch_transformers.BertModel>`__ - raw BERT Transformer model (\ **fully pre-trained**\ ),
+  * `BertForMaskedLM <./model_doc/bert.html#pytorch_transformers.BertForMaskedLM>`__ - BERT Transformer with the pre-trained masked language modeling head on top (\ **fully pre-trained**\ ),
+  * `BertForNextSentencePrediction <./model_doc/bert.html#pytorch_transformers.BertForNextSentencePrediction>`__ - BERT Transformer with the pre-trained next sentence prediction classifier on top  (\ **fully pre-trained**\ ),
+  * `BertForPreTraining <./model_doc/bert.html#pytorch_transformers.BertForPreTraining>`__ - BERT Transformer with masked language modeling head and next sentence prediction classifier on top (\ **fully pre-trained**\ ),
+  * `BertForSequenceClassification <./model_doc/bert.html#pytorch_transformers.BertForSequenceClassification>`__ - BERT Transformer with a sequence classification head on top (BERT Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
+  * `BertForMultipleChoice <./model_doc/bert.html#pytorch_transformers.BertForMultipleChoice>`__ - BERT Transformer with a multiple choice head on top (used for task like Swag) (BERT Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
+  * `BertForTokenClassification <./model_doc/bert.html#pytorch_transformers.BertForTokenClassification>`__ - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ ),
+  * `BertForQuestionAnswering <./model_doc/bert.html#pytorch_transformers.BertForQuestionAnswering>`__ - BERT Transformer with a token classification head on top (BERT Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ ).
+
+*
+  Three **OpenAI GPT** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_openai.py <./_modules/pytorch_transformers/modeling_openai.html>`__ file):
+
+
+  * `OpenAIGPTModel <./model_doc/gpt.html#pytorch_transformers.OpenAIGPTModel>`__ - raw OpenAI GPT Transformer model (\ **fully pre-trained**\ ),
+  * `OpenAIGPTLMHeadModel <./model_doc/gpt.html#pytorch_transformers.OpenAIGPTLMHeadModel>`__ - OpenAI GPT Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `OpenAIGPTDoubleHeadsModel <./model_doc/gpt.html#pytorch_transformers.OpenAIGPTDoubleHeadsModel>`__ - OpenAI GPT Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
+
+*
+  Two **Transformer-XL** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_transfo_xl.py <./_modules/pytorch_transformers/modeling_transfo_xl.html>`__ file):
+
+
+  * `TransfoXLModel <./model_doc/transformerxl.html#pytorch_transformers.TransfoXLModel>`__ - Transformer-XL model which outputs the last hidden state and memory cells (\ **fully pre-trained**\ ),
+  * `TransfoXLLMHeadModel <./model_doc/transformerxl.html#pytorch_transformers.TransfoXLLMHeadModel>`__ - Transformer-XL with the tied adaptive softmax head on top for language modeling which outputs the logits/loss and memory cells (\ **fully pre-trained**\ ),
+
+*
+  Three **OpenAI GPT-2** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_gpt2.py <./_modules/pytorch_transformers/modeling_gpt2.html>`__ file):
+
+
+  * `GPT2Model <./model_doc/gpt2.html#pytorch_transformers.GPT2Model>`__ - raw OpenAI GPT-2 Transformer model (\ **fully pre-trained**\ ),
+  * `GPT2LMHeadModel <./model_doc/gpt2.html#pytorch_transformers.GPT2LMHeadModel>`__ - OpenAI GPT-2 Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `GPT2DoubleHeadsModel <./model_doc/gpt2.html#pytorch_transformers.GPT2DoubleHeadsModel>`__ - OpenAI GPT-2 Transformer with the tied language modeling head and a multiple choice classification head on top (OpenAI GPT-2 Transformer is **pre-trained**\ , the multiple choice classification head **is only initialized and has to be trained**\ ),
+
+*
+  Four **XLM** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_xlm.py <./_modules/pytorch_transformers/modeling_xlm.html>`__ file):
+
+
+  * `XLMModel <./model_doc/xlm.html#pytorch_transformers.XLMModel>`__ - raw XLM Transformer model (\ **fully pre-trained**\ ),
+  * `XLMWithLMHeadModel <./model_doc/xlm.html#pytorch_transformers.XLMWithLMHeadModel>`__ - XLM Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `XLMForSequenceClassification <./model_doc/xlm.html#pytorch_transformers.XLMForSequenceClassification>`__ - XLM Transformer with a sequence classification head on top (XLM Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
+  * `XLMForQuestionAnswering <./model_doc/xlm.html#pytorch_transformers.XLMForQuestionAnswering>`__ - XLM Transformer with a token classification head on top (XLM Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ )
+
+*
+  Four **XLNet** PyTorch models (\ ``torch.nn.Module``\ ) with pre-trained weights (in the `modeling_xlnet.py <./_modules/pytorch_transformers/modeling_xlnet.html>`__ file):
+
+
+  * `XLNetModel <./model_doc/xlnet.html#pytorch_transformers.XLNetModel>`__ - raw XLNet Transformer model (\ **fully pre-trained**\ ),
+  * `XLNetLMHeadModel <./model_doc/xlnet.html#pytorch_transformers.XLNetLMHeadModel>`__ - XLNet Transformer with the tied language modeling head on top (\ **fully pre-trained**\ ),
+  * `XLNetForSequenceClassification <./model_doc/xlnet.html#pytorch_transformers.XLNetForSequenceClassification>`__ - XLNet Transformer with a sequence classification head on top (XLNet Transformer is **pre-trained**\ , the sequence classification head **is only initialized and has to be trained**\ ),
+  * `XLNetForQuestionAnswering <./model_doc/xlnet.html#pytorch_transformers.XLNetForQuestionAnswering>`__ - XLNet Transformer with a token classification head on top (XLNet Transformer is **pre-trained**\ , the token classification head **is only initialized and has to be trained**\ )
+
+
+TODO Lysandre filled: I filled in XLM and XLNet. I didn't do the Tokenizers because I don't know the current philosophy behind them.
+
+*
+  Tokenizers for **BERT** (using word-piece) (in the `tokenization_bert.py <./_modules/pytorch_transformers/tokenization_bert.html>`__ file):
+
+  * ``BasicTokenizer`` - basic tokenization (punctuation splitting, lower casing, etc.),
+  * ``WordpieceTokenizer`` - WordPiece tokenization,
+  * ``BertTokenizer`` - perform end-to-end tokenization, i.e. basic tokenization followed by WordPiece tokenization.
+
+
+*
+  Tokenizer for **OpenAI GPT** (using Byte-Pair-Encoding) (in the `tokenization_openai.py <./_modules/pytorch_transformers/tokenization_openai.html>`__ file):
+
+  * ``OpenAIGPTTokenizer`` - perform Byte-Pair-Encoding (BPE) tokenization.
+
+
+*
+  Tokenizer for **OpenAI GPT-2** (using byte-level Byte-Pair-Encoding) (in the `tokenization_gpt2.py <./_modules/pytorch_transformers/tokenization_gpt2.html>`__ file):
+
+  * ``GPT2Tokenizer`` - perform byte-level Byte-Pair-Encoding (BPE) tokenization.
+
+
+*
+  Tokenizer for **Transformer-XL** (word tokens ordered by frequency for adaptive softmax) (in the `tokenization_transfo_xl.py <./_modules/pytorch_transformers/tokenization_transfo_xl.html>`__ file):
+
+  * ``TransfoXLTokenizer`` - perform word tokenization and can order words by frequency in a corpus for use in an adaptive softmax.
+
+
+*
+  Tokenizer for **XLNet** (SentencePiece based tokenizer) (in the `tokenization_xlnet.py <./_modules/pytorch_transformers/tokenization_xlnet.html>`__ file):
+
+  * ``XLNetTokenizer`` - perform SentencePiece tokenization.
+
+
+*
+  Tokenizer for **XLM** (using Byte-Pair-Encoding) (in the `tokenization_xlm.py <./_modules/pytorch_transformers/tokenization_xlm.html>`__ file):
+
+  * ``XLMTokenizer`` - perform Byte-Pair-Encoding (BPE) tokenization.
+
+
+*
+  Optimizer (in the `optimization.py <./_modules/pytorch_transformers/optimization.html>`__ file):
+
+
+  * ``AdamW`` - Version of Adam algorithm with weight decay fix, warmup and linear decay of the learning rate.
+
+
+*
+  Configuration classes for BERT, OpenAI GPT, OpenAI GPT-2, Transformer-XL, XLM and XLNet (in the respective \
+  `modeling_bert.py <./_modules/pytorch_transformers/modeling_bert.html>`__\ , \
+  `modeling_openai.py <./_modules/pytorch_transformers/modeling_openai.html>`__\ , `modeling_gpt2.py <./_modules/pytorch_transformers/modeling_gpt2.html>`__\ , \
+  `modeling_transfo_xl.py <./_modules/pytorch_transformers/modeling_transfo_xl.html>`__, \
+  `modeling_xlm.py <./_modules/pytorch_transformers/modeling_xlm.html>`__, \
+  `modeling_xlnet.py <./_modules/pytorch_transformers/modeling_xlnet.html>`__ \
+  files):
+
+
+  * ``BertConfig`` - Configuration class to store the configuration of a ``BertModel`` with utilities to read and write from JSON configuration files.
+  * ``OpenAIGPTConfig`` - Configuration class to store the configuration of a ``OpenAIGPTModel`` with utilities to read and write from JSON configuration files.
+  * ``GPT2Config`` - Configuration class to store the configuration of a ``GPT2Model`` with utilities to read and write from JSON configuration files.
+  * ``TransfoXLConfig`` - Configuration class to store the configuration of a ``TransfoXLModel`` with utilities to read and write from JSON configuration files.
+  * ``XLMConfig`` - Configuration class to store the configuration of a ``XLMModel`` with utilities to read and write from JSON configuration files.
+  * ``XLNetConfig`` - Configuration class to store the configuration of a ``XLNetModel`` with utilities to read and write from JSON configuration files.
+
+The repository further comprises:
+
+
+*
+  Five examples on how to use **BERT** (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`__\ ):
+
+
+  * `run_bert_extract_features.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_bert_extract_features.py>`__ - Show how to extract hidden states from an instance of ``BertModel``\ ,
+  * `run_bert_classifier.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_bert_classifier.py>`__ - Show how to fine-tune an instance of ``BertForSequenceClassification`` on GLUE's MRPC task,
+  * `run_bert_squad.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_bert_squad.py>`__ - Show how to fine-tune an instance of ``BertForQuestionAnswering`` on SQuAD v1.0 and SQuAD v2.0 tasks.
+  * `run_swag.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_swag.py>`__ - Show how to fine-tune an instance of ``BertForMultipleChoice`` on Swag task.
+  * `simple_lm_finetuning.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/lm_finetuning/simple_lm_finetuning.py>`__ - Show how to fine-tune an instance of ``BertForPreTraining`` on a target text corpus.
+
+*
+  One example on how to use **OpenAI GPT** (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`__\ ):
+
+
+  * `run_openai_gpt.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_openai_gpt.py>`__ - Show how to fine-tune an instance of ``OpenAIGPTDoubleHeadsModel`` on the RocStories task.
+
+*
+  One example on how to use **Transformer-XL** (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`__\ ):
+
+
+  * `run_transfo_xl.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_transfo_xl.py>`__ - Show how to load and evaluate a pre-trained model of ``TransfoXLLMHeadModel`` on WikiText 103.
+
+*
+  One example on how to use **OpenAI GPT-2** in the unconditional and interactive mode (in the `examples folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples>`__\ ):
+
+
+  * `run_gpt2.py <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/examples/run_gpt2.py>`__ - Show how to use an instance of OpenAI GPT-2's ``GPT2LMHeadModel`` to generate text (same as the original OpenAI GPT-2 examples).
+
+  These examples are detailed in the `Examples <./examples.html>`__ section of this documentation.
+
+*
+  Three notebooks that were used to check that the TensorFlow and PyTorch models behave identically (in the `notebooks folder <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks>`__\ ):
+
+
+  * `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing-TF-and-PT-models.ipynb>`__ - Compare the hidden states predicted by ``BertModel``\ ,
+  * `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`__ - Compare the spans predicted by  ``BertForQuestionAnswering`` instances,
+  * `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`__ - Compare the predictions of the ``BertForPreTraining`` instances.
+
+  These notebooks are detailed in the `Notebooks <./notebooks.html>`__ section of this documentation.
+
+
+*
+  A command-line interface to convert TensorFlow checkpoints (BERT, Transformer-XL) or NumPy checkpoint (OpenAI) in a PyTorch save of the associated PyTorch model:
+
+  This CLI is detailed in the `Command-line interface <#Command-line-interface>`__ section of this readme.
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -0,0 +1,48 @@
+Installation
+================================================
+
+This repo was tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 0.4.1/1.0.0
+
+With pip
+^^^^^^^^
+
+PyTorch-Transformers can be installed by pip as follows:
+
+.. code-block:: bash
+
+   pip install pytorch-transformers
+
+If you want to reproduce the original tokenization process of the ``OpenAI GPT`` paper, you will need to install ``ftfy`` (limit to version 4.4.3 if you are using Python 2) and ``SpaCy`` :
+
+.. code-block:: bash
+
+   pip install spacy ftfy==4.4.3
+   python -m spacy download en
+
+If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
+
+From source
+^^^^^^^^^^^
+
+Clone the repository and run:
+
+.. code-block:: bash
+
+   pip install [--editable] .
+
+Here also, if you want to reproduce the original tokenization process of the ``OpenAI GPT`` model, you will need to install ``ftfy`` (limit to version 4.4.3 if you are using Python 2) and ``SpaCy`` :
+
+.. code-block:: bash
+
+   pip install spacy ftfy==4.4.3
+   python -m spacy download en
+
+Again, if you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage).
+
+A series of tests is included in the `tests folder <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/tests>`_ and can be run using ``pytest`` (install pytest if needed: ``pip install pytest``\ ).
+
+You can run the tests with the command:
+
+.. code-block:: bash
+
+   python -m pytest -sv ./pytorch_transformers/tests/
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -0,0 +1 @@
+# Migration
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -0,0 +1,78 @@
+BERT
+----------------------------------------------------
+
+``BertConfig``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.BertConfig
+    :members:
+
+
+``BertTokenizer``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.BertTokenizer
+    :members:
+
+
+``AdamW``
+~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.AdamW
+    :members:
+
+``BertModel``
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.BertModel
+    :members:
+
+
+``BertForPreTraining``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.BertForPreTraining
+    :members:
+
+
+``BertForMaskedLM``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.BertForMaskedLM
+    :members:
+
+
+``BertForNextSentencePrediction``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.BertForNextSentencePrediction
+    :members:
+
+
+``BertForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.BertForSequenceClassification
+    :members:
+
+
+``BertForMultipleChoice``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.BertForMultipleChoice
+    :members:
+
+
+``BertForTokenClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.BertForTokenClassification
+    :members:
+
+
+``BertForQuestionAnswering``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.BertForQuestionAnswering
+    :members:
+
--- a/docs/source/model_doc/gpt.rst
+++ b/docs/source/model_doc/gpt.rst
@@ -0,0 +1,36 @@
+OpenAI GPT
+----------------------------------------------------
+
+``OpenAIGPTConfig``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.OpenAIGPTConfig
+    :members:
+
+
+``OpenAIGPTTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.OpenAIGPTTokenizer
+    :members:
+
+
+``OpenAIGPTModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.OpenAIGPTModel
+    :members:
+
+
+``OpenAIGPTLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.OpenAIGPTLMHeadModel
+    :members:
+
+
+``OpenAIGPTDoubleHeadsModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.OpenAIGPTDoubleHeadsModel
+    :members:
--- a/docs/source/model_doc/gpt2.rst
+++ b/docs/source/model_doc/gpt2.rst
@@ -0,0 +1,36 @@
+OpenAI GPT2
+----------------------------------------------------
+
+``GPT2Config``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.GPT2Config
+    :members:
+
+
+``GPT2Tokenizer``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.GPT2Tokenizer
+    :members:
+
+
+``GPT2Model``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.GPT2Model
+    :members:
+
+
+``GPT2LMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.GPT2LMHeadModel
+    :members:
+
+
+``GPT2DoubleHeadsModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.GPT2DoubleHeadsModel
+    :members:
--- a/docs/source/model_doc/overview.rst
+++ b/docs/source/model_doc/overview.rst
@@ -0,0 +1,285 @@
+Overview
+================================================
+
+
+Here is a detailed documentation of the classes in the package and how to use them:
+
+.. list-table::
+   :header-rows: 1
+
+   * - Sub-section
+     - Description
+   * - `Loading pre-trained weights <#loading-google-ai-or-openai-pre-trained-weights-or-pytorch-dump>`__
+     - How to load Google AI/OpenAI's pre-trained weight or a PyTorch saved instance
+   * - `Serialization best-practices <#serialization-best-practices>`__
+     - How to save and reload a fine-tuned model
+   * - `Configurations <#configurations>`__
+     - API of the configuration classes for BERT, GPT, GPT-2 and Transformer-XL
+
+
+TODO Lysandre filled: Removed Models/Tokenizers/Optimizers as no single link can be made.
+
+
+Configurations
+^^^^^^^^^^^^^^
+
+Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and built from configuration classes which contain the
+parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON
+configuration files. The respective configuration classes are:
+
+
+* ``BertConfig`` for ``BertModel`` and BERT classes instances.
+* ``OpenAIGPTConfig`` for ``OpenAIGPTModel`` and OpenAI GPT classes instances.
+* ``GPT2Config`` for ``GPT2Model`` and OpenAI GPT-2 classes instances.
+* ``TransfoXLConfig`` for ``TransfoXLModel`` and Transformer-XL classes instances.
+
+These configuration classes contain a few utilities to load and save configurations:
+
+
+* ``from_dict(cls, json_object)``\ : A class method to construct a configuration from a Python dictionary of parameters. Returns an instance of the configuration class.
+* ``from_json_file(cls, json_file)``\ : A class method to construct a configuration from a json file of parameters. Returns an instance of the configuration class.
+* ``to_dict()``\ : Serializes an instance to a Python dictionary. Returns a dictionary.
+* ``to_json_string()``\ : Serializes an instance to a JSON string. Returns a string.
+* ``to_json_file(json_file_path)``\ : Save an instance to a json file.
+
+
+Loading Google AI or OpenAI pre-trained weights or PyTorch dump
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+``from_pretrained()`` method
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of ``BertForPreTraining`` saved with ``torch.save()``\ ), the PyTorch model classes and the tokenizer can be instantiated using the ``from_pretrained()`` method:
+
+.. code-block:: python
+
+   model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs)
+
+where
+
+
+* ``BERT_CLASS`` is either a tokenizer to load the vocabulary (\ ``BertTokenizer`` or ``OpenAIGPTTokenizer`` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): ``BertModel``\ , ``BertForMaskedLM``\ , ``BertForNextSentencePrediction``\ , ``BertForPreTraining``\ , ``BertForSequenceClassification``\ , ``BertForTokenClassification``\ , ``BertForMultipleChoice``\ , ``BertForQuestionAnswering``\ , ``OpenAIGPTModel``\ , ``OpenAIGPTLMHeadModel`` or ``OpenAIGPTDoubleHeadsModel``\ , and
+*
+  ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is either:
+
+
+  *
+    the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
+
+
+    * ``bert-base-uncased``: 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``bert-large-uncased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
+    * ``bert-base-cased``: 12-layer, 768-hidden, 12-heads , 110M parameters
+    * ``bert-large-cased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
+    * ``bert-base-multilingual-uncased``: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``bert-base-multilingual-cased``: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``bert-base-chinese``: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``bert-base-german-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://deepset.ai/german-bert>`__
+    * ``bert-large-uncased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the tokens corresponding to a word at once)
+    * ``bert-large-cased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the tokens corresponding to a word at once)
+    * ``bert-large-uncased-whole-word-masking-finetuned-squad``: The ``bert-large-uncased-whole-word-masking`` model finetuned on SQuAD (using the ``run_bert_squad.py`` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
+    * ``openai-gpt``: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``gpt2``: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
+    * ``gpt2-medium``: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
+    * ``transfo-xl-wt103``: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
+
+  *
+    a path or url to a pretrained model archive containing:
+
+
+    * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
+    * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
+
+  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/pytorch_pretrained_bert/modeling.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
+
+*
+  ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
+
+* ``from_tf``\ : should we load the weights from a locally saved TensorFlow checkpoint
+* ``state_dict``\ : an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
+* ``*inputs``\ , ``**kwargs``\ : additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
+
+``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`__ or the original TensorFlow repository.
+
+When using an ``uncased model``\ , make sure to pass ``--do_lower_case`` to the example training scripts (or pass ``do_lower_case=True`` to FullTokenizer if you're using your own script and loading the tokenizer yourself).
+
+Examples:
+
+.. code-block:: python
+
+   # BERT
+   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
+   model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+   # OpenAI GPT
+   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+   model = OpenAIGPTModel.from_pretrained('openai-gpt')
+
+   # Transformer-XL
+   tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+   model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
+
+   # OpenAI GPT-2
+   tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+   model = GPT2Model.from_pretrained('gpt2')
+
+Cache directory
+~~~~~~~~~~~~~~~
+
+``pytorch_pretrained_bert`` saves the pretrained weights in a cache directory which is located at (in this order of priority):
+
+
+* ``cache_dir`` optional arguments to the ``from_pretrained()`` method (see above),
+* shell environment variable ``PYTORCH_PRETRAINED_BERT_CACHE``\ ,
+* PyTorch cache home + ``/pytorch_pretrained_bert/``
+  where PyTorch cache home is defined by (in this order):
+
+  * shell environment variable ``ENV_TORCH_HOME``
+  * shell environment variable ``ENV_XDG_CACHE_HOME`` + ``/torch/``
+  * default: ``~/.cache/torch/``
+
+Usually, if you don't set any specific environment variable, ``pytorch_pretrained_bert`` cache will be at ``~/.cache/torch/pytorch_pretrained_bert/``.
+
+You can always safely delete the ``pytorch_pretrained_bert`` cache, but the pretrained model weights and vocabulary files will have to be re-downloaded from our S3.
+
+Serialization best-practices
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This section explains how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
+There are three types of files you need to save to be able to reload a fine-tuned model:
+
+
+* the model itself which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
+* the configuration file of the model which is saved as a JSON file, and
+* the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
+
+The *default filenames* of these files are as follows:
+
+
+* the model weights file: ``pytorch_model.bin``\ ,
+* the configuration file: ``config.json``\ ,
+* the vocabulary file: ``vocab.txt`` for BERT and Transformer-XL, ``vocab.json`` for GPT/GPT-2 (BPE vocabulary),
+* for GPT/GPT-2 (BPE vocabulary) the additional merges file: ``merges.txt``.
+
+If you save a model using these *default filenames*\ , you can then re-load the model and tokenizer using the ``from_pretrained()`` method.
+
+Here is the recommended way of saving the model, configuration and vocabulary to an ``output_dir`` directory and reloading the model and tokenizer afterwards:
+
+.. code-block:: python
+
+   from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
+
+   output_dir = "./models/"
+
+   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
+
+   # If we have a distributed model, save only the encapsulated model
+   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
+   model_to_save = model.module if hasattr(model, 'module') else model
+
+   # If we save using the predefined names, we can load using `from_pretrained`
+   output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
+   output_config_file = os.path.join(output_dir, CONFIG_NAME)
+
+   torch.save(model_to_save.state_dict(), output_model_file)
+   model_to_save.config.to_json_file(output_config_file)
+   tokenizer.save_vocabulary(output_dir)
+
+   # Step 2: Re-load the saved model and vocabulary
+
+   # Example for a Bert model
+   model = BertForQuestionAnswering.from_pretrained(output_dir)
+   tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
+   # Example for a GPT model
+   model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
+   tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
+
+Here is another way you can save and reload the model if you want to use specific paths for each type of files:
+
+.. code-block:: python
+
+   output_model_file = "./models/my_own_model_file.bin"
+   output_config_file = "./models/my_own_config_file.bin"
+   output_vocab_file = "./models/my_own_vocab_file.bin"
+
+   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
+
+   # If we have a distributed model, save only the encapsulated model
+   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
+   model_to_save = model.module if hasattr(model, 'module') else model
+
+   torch.save(model_to_save.state_dict(), output_model_file)
+   model_to_save.config.to_json_file(output_config_file)
+   tokenizer.save_vocabulary(output_vocab_file)
+
+   # Step 2: Re-load the saved model and vocabulary
+
+   # We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
+   # Here is how to do it in this situation:
+
+   # Example for a Bert model
+   config = BertConfig.from_json_file(output_config_file)
+   model = BertForQuestionAnswering(config)
+   state_dict = torch.load(output_model_file)
+   model.load_state_dict(state_dict)
+   tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
+
+   # Example for a GPT model
+   config = OpenAIGPTConfig.from_json_file(output_config_file)
+   model = OpenAIGPTDoubleHeadsModel(config)
+   state_dict = torch.load(output_model_file)
+   model.load_state_dict(state_dict)
+   tokenizer = OpenAIGPTTokenizer(output_vocab_file)
+
+Learning Rate Schedules
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``.optimization`` module also provides additional schedules in the form of schedule objects that inherit from ``_LRSchedule``.
+All ``_LRSchedule`` subclasses accept ``warmup`` and ``t_total`` arguments at construction.
+When an ``_LRSchedule`` object is passed into ``AdamW``\ ,
+the ``warmup`` and ``t_total`` arguments on the optimizer are ignored and the ones in the ``_LRSchedule`` object are used.
+An overview of the implemented schedules:
+
+
+* ``ConstantLR``\ : always returns learning rate 1.
+* ``WarmupConstantSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+  Keeps learning rate equal to 1. after warmup.
+
+  .. image:: /imgs/warmup_constant_schedule.png
+     :target: /imgs/warmup_constant_schedule.png
+     :alt:
+
+
+* ``WarmupLinearSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+  Linearly decreases learning rate from 1. to 0. over remaining ``1 - warmup`` steps.
+
+  .. image:: /imgs/warmup_linear_schedule.png
+     :target: /imgs/warmup_linear_schedule.png
+     :alt:
+
+
+* ``WarmupCosineSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps. \
+  Decreases learning rate from 1. to 0. over remaining ``1 - warmup`` steps following a cosine curve. \
+  If ``cycles`` (default=0.5) is different from default, learning rate follows cosine function after warmup.
+
+  .. image:: /imgs/warmup_cosine_schedule.png
+     :target: /imgs/warmup_cosine_schedule.png
+     :alt:
+
+
+* ``WarmupCosineWithHardRestartsSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
+  If ``cycles`` (default=1.) is different from default, learning rate follows ``cycles`` times a cosine decaying learning rate (with hard restarts).
+
+  .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
+     :target: /imgs/warmup_cosine_hard_restarts_schedule.png
+     :alt:
+
+
+* ``WarmupCosineWithWarmupRestartsSchedule`` : All training progress is divided in ``cycles`` (default=1.) parts of equal length.
+  Every part follows a schedule with the first ``warmup`` fraction of the training steps linearly increasing from 0. to 1.,
+  followed by a learning rate decreasing from 1. to 0. following a cosine curve.
+  Note that the total number of all warmup steps over all cycles together is equal to ``warmup`` * ``cycles``
+
+  .. image:: /imgs/warmup_cosine_warm_restarts_schedule.png
+     :target: /imgs/warmup_cosine_warm_restarts_schedule.png
+     :alt:
--- a/docs/source/model_doc/transformerxl.rst
+++ b/docs/source/model_doc/transformerxl.rst
@@ -0,0 +1,30 @@
+Transformer XL
+----------------------------------------------------
+
+
+``TransfoXLConfig``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.TransfoXLConfig
+    :members:
+
+
+``TransfoXLTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.TransfoXLTokenizer
+    :members:
+
+
+``TransfoXLModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.TransfoXLModel
+    :members:
+
+
+``TransfoXLLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.TransfoXLLMHeadModel
+    :members:
--- a/docs/source/model_doc/xlm.rst
+++ b/docs/source/model_doc/xlm.rst
@@ -0,0 +1,41 @@
+XLM
+----------------------------------------------------
+
+``XLMConfig``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLMConfig
+    :members:
+
+``XLMTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLMTokenizer
+    :members:
+
+``XLMModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLMModel
+    :members:
+
+
+``XLMWithLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLMWithLMHeadModel
+    :members:
+
+
+``XLMForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLMForSequenceClassification
+    :members:
+
+
+``XLMForQuestionAnswering``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLMForQuestionAnswering
+    :members:
--- a/docs/source/model_doc/xlnet.rst
+++ b/docs/source/model_doc/xlnet.rst
@@ -0,0 +1,43 @@
+XLNet
+----------------------------------------------------
+
+``XLNetConfig``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLNetConfig
+    :members:
+
+
+``XLNetTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLNetTokenizer
+    :members:
+
+
+``XLNetModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLNetModel
+    :members:
+
+
+``XLNetLMHeadModel``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLNetLMHeadModel
+    :members:
+
+
+``XLNetForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLNetForSequenceClassification
+    :members:
+
+
+``XLNetForQuestionAnswering``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.XLNetForQuestionAnswering
+    :members:
--- a/docs/source/notebooks.rst
+++ b/docs/source/notebooks.rst
@@ -0,0 +1,16 @@
+Notebooks
+================================================
+
+We include `three Jupyter Notebooks <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks>`_ that can be used to check that the predictions of the PyTorch model are identical to the predictions of the original TensorFlow model.
+
+
+*
+  The first NoteBook (\ `Comparing-TF-and-PT-models.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks/Comparing-TF-and-PT-models.ipynb>`_\ ) extracts the hidden states of a full sequence on each layer of the TensorFlow and the PyTorch models and computes the standard deviation between them. In the given example, we get a standard deviation of 1.5e-7 to 9e-7 on the various hidden states of the models.
+
+*
+  The second NoteBook (\ `Comparing-TF-and-PT-models-SQuAD.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb>`_\ ) compares the loss computed by the TensorFlow and the PyTorch models for identical initialization of the fine-tuning layer of the ``BertForQuestionAnswering`` and computes the standard deviation between them. In the given example, we get a standard deviation of 2.5e-7 between the models.
+
+*
+  The third NoteBook (\ `Comparing-TF-and-PT-models-MLM-NSP.ipynb <https://github.com/huggingface/pytorch-pretrained-BERT/tree/master/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb>`_\ ) compares the predictions computed by the TensorFlow and the PyTorch models for masked token language modeling using the pre-trained masked language modeling model.
+
+Please follow the instructions given in the notebooks to run and modify them.
--- a/docs/source/philosophy.md
+++ b/docs/source/philosophy.md
@@ -0,0 +1 @@
+# Philosophy
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -0,0 +1,171 @@
+### Loading Google AI or OpenAI pre-trained weights or PyTorch dump
+
+### `from_pretrained()` method
+
+To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated using the `from_pretrained()` method:
+
+```python
+model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs)
+```
+
+where
+
+- `BERT_CLASS` is either a tokenizer to load the vocabulary (`BertTokenizer` or `OpenAIGPTTokenizer` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification`, `BertForTokenClassification`, `BertForMultipleChoice`, `BertForQuestionAnswering`, `OpenAIGPTModel`, `OpenAIGPTLMHeadModel` or `OpenAIGPTDoubleHeadsModel`, and
+- `PRE_TRAINED_MODEL_NAME_OR_PATH` is either:
+
+  - the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
+
+    - `bert-base-uncased`: 12-layer, 768-hidden, 12-heads, 110M parameters
+    - `bert-large-uncased`: 24-layer, 1024-hidden, 16-heads, 340M parameters
+    - `bert-base-cased`: 12-layer, 768-hidden, 12-heads , 110M parameters
+    - `bert-large-cased`: 24-layer, 1024-hidden, 16-heads, 340M parameters
+    - `bert-base-multilingual-uncased`: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+    - `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+    - `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
+    - `bert-base-german-cased`: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters [Performance Evaluation](https://deepset.ai/german-bert)
+    - `bert-large-uncased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the tokens corresponding to a word at once)
+    - `bert-large-cased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the tokens corresponding to a word at once)
+    - `bert-large-uncased-whole-word-masking-finetuned-squad`: The `bert-large-uncased-whole-word-masking` model finetuned on SQuAD (using the `run_bert_squad.py` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
+    - `openai-gpt`: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
+    - `gpt2`: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
+    - `gpt2-medium`: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
+    - `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
+
+  - a path or url to a pretrained model archive containing:
+
+    - `bert_config.json` or `openai_gpt_config.json` a configuration file for the model, and
+    - `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining`, `OpenAIGPTModel`, `TransfoXLModel`, `GPT2LMHeadModel` (saved with the usual `torch.save()`)
+
+  If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_transformers/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_transformers/`).
+
+- `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information).
+- `from_tf`: should we load the weights from a locally saved TensorFlow checkpoint
+- `state_dict`: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
+- `*inputs`, `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
+
+`Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository.
+
+**When using an `uncased model`, make sure to pass `--do_lower_case` to the example training scripts (or pass `do_lower_case=True` to FullTokenizer if you're using your own script and loading the tokenizer yourself).**
+
+Examples:
+
+```python
+# BERT
+tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
+model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+
+# OpenAI GPT
+tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+model = OpenAIGPTModel.from_pretrained('openai-gpt')
+
+# Transformer-XL
+tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
+
+# OpenAI GPT-2
+tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+model = GPT2Model.from_pretrained('gpt2')
+
+```
+
+#### Cache directory
+
+`pytorch_transformers` saves the pretrained weights in a cache directory which is located at (in this order of priority):
+
+- `cache_dir` optional arguments to the `from_pretrained()` method (see above),
+- shell environment variable `PYTORCH_PRETRAINED_BERT_CACHE`,
+- PyTorch cache home + `/pytorch_transformers/`
+  where PyTorch cache home is defined by (in this order):
+  - shell environment variable `ENV_TORCH_HOME`
+  - shell environment variable `ENV_XDG_CACHE_HOME` + `/torch/`
+  - default: `~/.cache/torch/`
+
+Usually, if you don't set any specific environment variable, `pytorch_transformers` cache will be at `~/.cache/torch/pytorch_transformers/`.
+
+You can always safely delete the `pytorch_transformers` cache, but the pretrained model weights and vocabulary files will have to be re-downloaded from our S3.
+
+### Serialization best-practices
+
+This section explains how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
+There are three types of files you need to save to be able to reload a fine-tuned model:
+
+- the model itself which should be saved following PyTorch serialization [best practices](https://pytorch.org/docs/stable/notes/serialization.html#best-practices),
+- the configuration file of the model which is saved as a JSON file, and
+- the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
+
+The *default filenames* of these files are as follows:
+
+- the model weights file: `pytorch_model.bin`,
+- the configuration file: `config.json`,
+- the vocabulary file: `vocab.txt` for BERT and Transformer-XL, `vocab.json` for GPT/GPT-2 (BPE vocabulary),
+- for GPT/GPT-2 (BPE vocabulary) the additional merges file: `merges.txt`.
+
+**If you save a model using these *default filenames*, you can then re-load the model and tokenizer using the `from_pretrained()` method.**
+
+Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards:
+
+```python
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+
+output_dir = "./models/"
+
+# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
+
+# If we have a distributed model, save only the encapsulated model
+# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
+model_to_save = model.module if hasattr(model, 'module') else model
+
+# If we save using the predefined names, we can load using `from_pretrained`
+output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
+output_config_file = os.path.join(output_dir, CONFIG_NAME)
+
+torch.save(model_to_save.state_dict(), output_model_file)
+model_to_save.config.to_json_file(output_config_file)
+tokenizer.save_vocabulary(output_dir)
+
+# Step 2: Re-load the saved model and vocabulary
+
+# Example for a Bert model
+model = BertForQuestionAnswering.from_pretrained(output_dir)
+tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
+# Example for a GPT model
+model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
+tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
+```
+
+Here is another way you can save and reload the model if you want to use specific paths for each type of files:
+
+```python
+output_model_file = "./models/my_own_model_file.bin"
+output_config_file = "./models/my_own_config_file.bin"
+output_vocab_file = "./models/my_own_vocab_file.bin"
+
+# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
+
+# If we have a distributed model, save only the encapsulated model
+# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
+model_to_save = model.module if hasattr(model, 'module') else model
+
+torch.save(model_to_save.state_dict(), output_model_file)
+model_to_save.config.to_json_file(output_config_file)
+tokenizer.save_vocabulary(output_vocab_file)
+
+# Step 2: Re-load the saved model and vocabulary
+
+# We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
+# Here is how to do it in this situation:
+
+# Example for a Bert model
+config = BertConfig.from_json_file(output_config_file)
+model = BertForQuestionAnswering(config)
+state_dict = torch.load(output_model_file)
+model.load_state_dict(state_dict)
+tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
+
+# Example for a GPT model
+config = OpenAIGPTConfig.from_json_file(output_config_file)
+model = OpenAIGPTDoubleHeadsModel(config)
+state_dict = torch.load(output_model_file)
+model.load_state_dict(state_dict)
+tokenizer = OpenAIGPTTokenizer(output_vocab_file)
+```
--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -0,0 +1,132 @@
+TorchScript
+================================================
+
+.. note::
+    This is the very beginning of our experiments with TorchScript and we are still exploring its capabilities
+    with variable-input-size models. It is a focus of interest to us and we will deepen our analysis in upcoming
+    releases, with more code examples, a more flexible implementation, and benchmarks comparing python-based codes
+    with compiled TorchScript.
+
+
+According to Pytorch's documentation: "TorchScript is a way to create serializable and optimizable models from PyTorch code".
+Pytorch's two modules `JIT and TRACE <https://pytorch.org/docs/stable/jit.html>`_ allow the developer to export
+their model to be re-used in other programs, such as efficiency-oriented C++ programs.
+
+We have provided an interface that allows the export of `pytorch-transformers` models to TorchScript so that they can
+be reused in a different environment than a Pytorch-based python program. Here we explain how to use our models so that
+they can be exported, and what to be mindful of when using these models with TorchScript.
+
+Exporting a model needs two things:
+
+* dummy inputs to execute a model forward pass.
+* the model needs to be instantiated with the ``torchscript`` flag.
+
+These necessities imply several things developers should be careful about. These are detailed below.
+
+
+Implications
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+TorchScript flag and tied weights
+------------------------------------------------
+This flag is necessary because most of the language models in this repository have tied weights between their
+``Embedding`` layer and their ``Decoding`` layer. TorchScript does not allow the export of models that have tied weights;
+it is therefore necessary to untie the weights beforehand.
+
+This implies that models instantiated with the ``torchscript`` flag have their ``Embedding`` layer and ``Decoding`` layer
+separate, which means that they should not be trained down the line. Training would de-synchronize the two layers,
+leading to unexpected results.
+
+This is not the case for models that do not have a Language Model head, as those do not have tied weights. These models
+can be safely exported without the ``torchscript`` flag.
+
+Dummy inputs and standard lengths
+------------------------------------------------
+
+The dummy inputs are used to do a model forward pass. While the inputs' values are propagating through the layers,
+Pytorch keeps track of the different operations executed on each tensor. These recorded operations are then used
+to create the "trace" of the model.
+
+The trace is created relative to the inputs' dimensions. It is therefore constrained by the dimensions of the dummy
+input, and will not work for any other sequence length or batch size. When trying with a different size, an error such
+as:
+
+``The expanded size of the tensor (3) must match the existing size (7) at non-singleton dimension 2``
+
+will be raised. It is therefore recommended to trace the model with a dummy input size at least as large as the largest
+input that will be fed to the model during inference. Padding can be performed to fill the missing values. As the model
+will have been traced with a large input size however, the dimensions of the different matrices will be large as well,
+resulting in more calculations.
+
+It is recommended to be careful of the total number of operations done on each input and to follow performance closely
+when exporting varying sequence-length models.
+
+Using TorchScript in Python
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Below are examples of using Python to save and load models, as well as how to use the trace for inference.
+
+Saving a model
+------------------------------------------------
+
+This snippet shows how to use TorchScript to export a ``BertModel``. Here the ``BertModel`` is instantiated
+according to a ``BertConfig`` class and then saved to disk under the filename ``traced_bert.pt``
+
+.. code-block:: python
+
+    from pytorch_transformers import BertModel, BertTokenizer, BertConfig
+    import torch
+
+    enc = BertTokenizer.from_pretrained("bert-base-uncased")
+
+    # Tokenizing input text
+    text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+    tokenized_text = enc.tokenize(text)
+
+    # Masking one of the input tokens
+    masked_index = 8
+    tokenized_text[masked_index] = '[MASK]'
+    indexed_tokens = enc.convert_tokens_to_ids(tokenized_text)
+    segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+
+    # Creating a dummy input
+    tokens_tensor = torch.tensor([indexed_tokens])
+    segments_tensors = torch.tensor([segments_ids])
+    dummy_input = [tokens_tensor, segments_tensors]
+
+    # Initializing the model with the torchscript flag
+    # Flag set to True even though it is not necessary as this model does not have an LM Head.
+    config = BertConfig(vocab_size_or_config_json_file=32000, hidden_size=768,
+        num_hidden_layers=12, num_attention_heads=12, intermediate_size=3072, torchscript=True)
+
+    # Instantiating the model
+    model = BertModel(config)
+
+    # The model needs to be in evaluation mode
+    model.eval()
+
+    # Creating the trace
+    traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
+    torch.jit.save(traced_model, "traced_bert.pt")
+
+Loading a model
+------------------------------------------------
+
+This snippet shows how to load the ``BertModel`` that was previously saved to disk under the name ``traced_bert.pt``.
+We are re-using the previously initialised ``dummy_input``.
+
+.. code-block:: python
+
+    loaded_model = torch.jit.load("traced_bert.pt")
+    loaded_model.eval()
+
+    all_encoder_layers, pooled_output = loaded_model(*dummy_input)
+
+Using a traced model for inference
+------------------------------------------------
+
+Using the traced model for inference is as simple as using its ``__call__`` dunder method:
+
+.. code-block:: python
+
+    traced_model(tokens_tensor, segments_tensors)
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -0,0 +1,339 @@
+Usage
+================================================
+
+BERT
+^^^^
+
+Here is a quick-start example using ``BertTokenizer``\ , ``BertModel`` and ``BertForMaskedLM`` class with Google AI's pre-trained ``Bert base uncased`` model. See the `doc section <./model_doc/overview.html>`_ below for all the details on these classes.
+
+First let's prepare a tokenized input with ``BertTokenizer``
+
+.. code-block:: python
+
+   import torch
+   from pytorch_transformers import BertTokenizer, BertModel, BertForMaskedLM
+
+   # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
+   import logging
+   logging.basicConfig(level=logging.INFO)
+
+   # Load pre-trained model tokenizer (vocabulary)
+   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+   # Tokenized input
+   text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+   tokenized_text = tokenizer.tokenize(text)
+
+   # Mask a token that we will try to predict back with `BertForMaskedLM`
+   masked_index = 8
+   tokenized_text[masked_index] = '[MASK]'
+   assert tokenized_text == ['[CLS]', 'who', 'was', 'jim', 'henson', '?', '[SEP]', 'jim', '[MASK]', 'was', 'a', 'puppet', '##eer', '[SEP]']
+
+   # Convert token to vocabulary indices
+   indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+   # Define sentence A and B indices associated to 1st and 2nd sentences (see paper)
+   segments_ids = [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]
+
+   # Convert inputs to PyTorch tensors
+   tokens_tensor = torch.tensor([indexed_tokens])
+   segments_tensors = torch.tensor([segments_ids])
+
+Let's see how to use ``BertModel`` to get hidden states
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = BertModel.from_pretrained('bert-base-uncased')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor = tokens_tensor.to('cuda')
+   segments_tensors = segments_tensors.to('cuda')
+   model.to('cuda')
+
+   # Predict hidden states features for each layer
+   with torch.no_grad():
+       encoded_layers, _ = model(tokens_tensor, segments_tensors)
+   # We have a hidden state for each of the 12 layers in model bert-base-uncased
+   assert len(encoded_layers) == 12
+
+And how to use ``BertForMaskedLM``
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = BertForMaskedLM.from_pretrained('bert-base-uncased')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor = tokens_tensor.to('cuda')
+   segments_tensors = segments_tensors.to('cuda')
+   model.to('cuda')
+
+   # Predict all tokens
+   with torch.no_grad():
+       predictions = model(tokens_tensor, segments_tensors)
+
+   # confirm we were able to predict 'henson'
+   predicted_index = torch.argmax(predictions[0, masked_index]).item()
+   predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+   assert predicted_token == 'henson'
+
+OpenAI GPT
+^^^^^^^^^^
+
+Here is a quick-start example using ``OpenAIGPTTokenizer``\ , ``OpenAIGPTModel`` and ``OpenAIGPTLMHeadModel`` class with OpenAI's pre-trained  model. See the `doc section <./model_doc/overview.html>`_ for all the details on these classes.
+
+First let's prepare a tokenized input with ``OpenAIGPTTokenizer``
+
+.. code-block:: python
+
+   import torch
+   from pytorch_transformers import OpenAIGPTTokenizer, OpenAIGPTModel, OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel
+
+   # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
+   import logging
+   logging.basicConfig(level=logging.INFO)
+
+   # Load pre-trained model tokenizer (vocabulary)
+   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+
+   # Tokenized input
+   text = "Who was Jim Henson ? Jim Henson was a puppeteer"
+   tokenized_text = tokenizer.tokenize(text)
+
+   # Convert token to vocabulary indices
+   indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+
+   # Convert inputs to PyTorch tensors
+   tokens_tensor = torch.tensor([indexed_tokens])
+
+Let's see how to use ``OpenAIGPTModel`` to get hidden states
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = OpenAIGPTModel.from_pretrained('openai-gpt')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor = tokens_tensor.to('cuda')
+   model.to('cuda')
+
+   # Predict hidden states features for each layer
+   with torch.no_grad():
+       hidden_states = model(tokens_tensor)
+
+And how to use ``OpenAIGPTLMHeadModel``
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor = tokens_tensor.to('cuda')
+   model.to('cuda')
+
+   # Predict all tokens
+   with torch.no_grad():
+       predictions = model(tokens_tensor)
+
+   # get the predicted last token
+   predicted_index = torch.argmax(predictions[0, -1, :]).item()
+   predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+   assert predicted_token == '.</w>'
+
+And how to use ``OpenAIGPTDoubleHeadsModel``
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+   model.eval()
+
+   #  Prepare tokenized input
+   text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+   text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+   tokenized_text1 = tokenizer.tokenize(text1)
+   tokenized_text2 = tokenizer.tokenize(text2)
+   indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+   indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+   tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+   mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+
+   # Predict the language modeling and multiple choice logits
+   with torch.no_grad():
+       lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
+
+Transformer-XL
+^^^^^^^^^^^^^^
+
+Here is a quick-start example using ``TransfoXLTokenizer``\ , ``TransfoXLModel`` and ``TransfoXLLMHeadModel`` classes with the Transformer-XL model pre-trained on WikiText-103. See the `doc section <./model_doc/overview.html>`_ for all the details on these classes.
+
+First let's prepare a tokenized input with ``TransfoXLTokenizer``
+
+.. code-block:: python
+
+   import torch
+   from pytorch_transformers import TransfoXLTokenizer, TransfoXLModel, TransfoXLLMHeadModel
+
+   # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
+   import logging
+   logging.basicConfig(level=logging.INFO)
+
+   # Load pre-trained model tokenizer (vocabulary from wikitext 103)
+   tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+
+   # Tokenized input
+   text_1 = "Who was Jim Henson ?"
+   text_2 = "Jim Henson was a puppeteer"
+   tokenized_text_1 = tokenizer.tokenize(text_1)
+   tokenized_text_2 = tokenizer.tokenize(text_2)
+
+   # Convert token to vocabulary indices
+   indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
+   indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
+
+   # Convert inputs to PyTorch tensors
+   tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+   tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+
+Let's see how to use ``TransfoXLModel`` to get hidden states
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor_1 = tokens_tensor_1.to('cuda')
+   tokens_tensor_2 = tokens_tensor_2.to('cuda')
+   model.to('cuda')
+
+   with torch.no_grad():
+       # Predict hidden states features for each layer
+       hidden_states_1, mems_1 = model(tokens_tensor_1)
+       # We can re-use the memory cells in a subsequent call to attend a longer context
+       hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
+
+And how to use ``TransfoXLLMHeadModel``
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor_1 = tokens_tensor_1.to('cuda')
+   tokens_tensor_2 = tokens_tensor_2.to('cuda')
+   model.to('cuda')
+
+   with torch.no_grad():
+       # Predict all tokens
+       predictions_1, mems_1 = model(tokens_tensor_1)
+       # We can re-use the memory cells in a subsequent call to attend a longer context
+       predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
+
+   # get the predicted last token
+   predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+   predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+   assert predicted_token == 'who'
+
+OpenAI GPT-2
+^^^^^^^^^^^^
+
+Here is a quick-start example using ``GPT2Tokenizer``\ , ``GPT2Model`` and ``GPT2LMHeadModel`` class with OpenAI's pre-trained  model. See the `doc section <./model_doc/overview.html>`_ for all the details on these classes.
+
+First let's prepare a tokenized input with ``GPT2Tokenizer``
+
+.. code-block:: python
+
+   import torch
+   from pytorch_transformers import GPT2Tokenizer, GPT2Model, GPT2LMHeadModel, GPT2DoubleHeadsModel
+
+   # OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
+   import logging
+   logging.basicConfig(level=logging.INFO)
+
+   # Load pre-trained model tokenizer (vocabulary)
+   tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+
+   # Encode some inputs
+   text_1 = "Who was Jim Henson ?"
+   text_2 = "Jim Henson was a puppeteer"
+   indexed_tokens_1 = tokenizer.encode(text_1)
+   indexed_tokens_2 = tokenizer.encode(text_2)
+
+   # Convert inputs to PyTorch tensors
+   tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+   tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+
+Let's see how to use ``GPT2Model`` to get hidden states
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = GPT2Model.from_pretrained('gpt2')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor_1 = tokens_tensor_1.to('cuda')
+   tokens_tensor_2 = tokens_tensor_2.to('cuda')
+   model.to('cuda')
+
+   # Predict hidden states features for each layer
+   with torch.no_grad():
+       hidden_states_1, past = model(tokens_tensor_1)
+       # past can be used to reuse precomputed hidden states in subsequent predictions
+       # (see beam-search examples in the run_gpt2.py example).
+       hidden_states_2, past = model(tokens_tensor_2, past=past)
+
+And how to use ``GPT2LMHeadModel``
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = GPT2LMHeadModel.from_pretrained('gpt2')
+   model.eval()
+
+   # If you have a GPU, put everything on cuda
+   tokens_tensor_1 = tokens_tensor_1.to('cuda')
+   tokens_tensor_2 = tokens_tensor_2.to('cuda')
+   model.to('cuda')
+
+   # Predict all tokens
+   with torch.no_grad():
+       predictions_1, past = model(tokens_tensor_1)
+       # past can be used to reuse precomputed hidden states in subsequent predictions
+       # (see beam-search examples in the run_gpt2.py example).
+       predictions_2, past = model(tokens_tensor_2, past=past)
+
+   # get the predicted last token
+   predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+   predicted_token = tokenizer.decode([predicted_index])
+
+And how to use ``GPT2DoubleHeadsModel``
+
+.. code-block:: python
+
+   # Load pre-trained model (weights)
+   model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
+   model.eval()
+
+   #  Prepare tokenized input
+   text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+   text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+   tokenized_text1 = tokenizer.tokenize(text1)
+   tokenized_text2 = tokenizer.tokenize(text2)
+   indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+   indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+   tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+   mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+
+   # Predict the language modeling and multiple choice logits
+   with torch.no_grad():
+       lm_logits, multiple_choice_logits, past = model(tokens_tensor, mc_token_ids)
--- a/examples/bertology.py
+++ b/examples/bertology.py
@@ -1,310 +0,0 @@
-#!/usr/bin/env python3
-import os
-import argparse
-import logging
-from datetime import timedelta, datetime
-from tqdm import tqdm
-
-import numpy as np
-
-import torch
-from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subset
-from torch.utils.data.distributed import DistributedSampler
-from torch.nn import CrossEntropyLoss, MSELoss
-
-from pytorch_pretrained_bert import BertForSequenceClassification, BertTokenizer
-
-from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics
-
-
-logger = logging.getLogger(__name__)
-
-
-def entropy(p):
-    plogp = p * torch.log(p)
-    plogp[p == 0] = 0
-    return -plogp.sum(dim=-1)
-
-
-def print_1d_tensor(tensor, prefix=""):
-    if tensor.dtype != torch.long:
-        logger.info(prefix + "\t".join(f"{x:.5f}" for x in tensor.cpu().data))
-    else:
-        logger.info(prefix + "\t".join(f"{x:d}" for x in tensor.cpu().data))
-
-
-def print_2d_tensor(tensor):
-    logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(len(tensor))))
-    for row in range(len(tensor)):
-        print_1d_tensor(tensor[row], prefix=f"layer {row + 1}:\t")
-
-
-def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None):
-    """ Example on how to use model outputs to compute:
-        - head attention entropy (activated by setting output_attentions=True when we created the model
-        - head importance scores according to http://arxiv.org/abs/1905.10650
-            (activated by setting keep_multihead_output=True when we created the model)
-    """
-    # Prepare our tensors
-    n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads
-    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
-    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)
-    preds = None
-    labels = None
-    tot_tokens = 0.0
-
-    for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-        batch = tuple(t.to(args.device) for t in batch)
-        input_ids, input_mask, segment_ids, label_ids = batch
-
-        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
-        all_attentions, logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, head_mask=head_mask)
-
-        if compute_entropy:
-            # Update head attention entropy
-            for layer, attn in enumerate(all_attentions):
-                masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
-                attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()
-
-        if compute_importance:
-            # Update head importance scores with regards to our loss
-            # First, backpropagate to populate the gradients
-            if args.output_mode == "classification":
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, args.num_labels), label_ids.view(-1))
-            elif args.output_mode == "regression":
-                loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), label_ids.view(-1))
-            loss.backward()
-            # Second, compute importance scores according to http://arxiv.org/abs/1905.10650
-            multihead_outputs = model.bert.get_multihead_outputs()
-            for layer, mh_layer_output in enumerate(multihead_outputs):
-                dot = torch.einsum("bhli,bhli->bhl", [mh_layer_output.grad, mh_layer_output])
-                head_importance[layer] += dot.abs().sum(-1).sum(0).detach()
-
-        # Also store our logits/labels if we want to compute metrics afterwards
-        if preds is None:
-            preds = logits.detach().cpu().numpy()
-            labels = label_ids.detach().cpu().numpy()
-        else:
-            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
-            labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0)
-
-        tot_tokens += input_mask.float().detach().sum().data
-
-    # Normalize
-    attn_entropy /= tot_tokens
-    head_importance /= tot_tokens
-    # Layerwise importance normalization
-    if not args.dont_normalize_importance_by_layer:
-        exponent = 2
-        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent)
-        head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20
-
-    if not args.dont_normalize_global_importance:
-        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())
-
-    return attn_entropy, head_importance, preds, labels
-
-
-def run_model():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name_or_path', type=str, default='bert-base-cased-finetuned-mrpc', help='pretrained model name or path to local checkpoint')
-    parser.add_argument("--task_name", type=str, default='mrpc', help="The name of the task to train.")
-    parser.add_argument("--data_dir", type=str, required=True, help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--output_dir", type=str, required=True, help="The output directory where the model predictions and checkpoints will be written.")
-    parser.add_argument("--data_subset", type=int, default=-1, help="If > 0: limit the data to a subset of data_subset instances.")
-    parser.add_argument("--overwrite_output_dir", action='store_true', help="Whether to overwrite data in output directory")
-
-    parser.add_argument("--dont_normalize_importance_by_layer", action='store_true', help="Don't normalize importance score by layers")
-    parser.add_argument("--dont_normalize_global_importance", action='store_true', help="Don't normalize all importance scores between 0 and 1")
-
-    parser.add_argument("--try_masking", action='store_true', help="Whether to try to mask head until a threshold of accuracy.")
-    parser.add_argument("--masking_threshold", default=0.9, type=float, help="masking threshold in term of metrics"
-                                                                             "(stop masking when metric < threshold * original metric value).")
-    parser.add_argument("--masking_amount", default=0.1, type=float, help="Amount to heads to masking at each masking step.")
-    parser.add_argument("--metric_name", default="acc", type=str, help="Metric to use for head masking.")
-
-    parser.add_argument("--max_seq_length", default=128, type=int, help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
-    parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
-
-    parser.add_argument("--seed", type=int, default=42)
-    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
-    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
-    args = parser.parse_args()
-
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    # Setup devices and distributed training
-    if args.local_rank == -1 or args.no_cuda:
-        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        args.device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        torch.distributed.init_process_group(backend='nccl')  # Initializes the distributed backend
-
-    # Setup logging
-    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, n_gpu, bool(args.local_rank != -1)))
-
-    # Set seeds
-    np.random.seed(args.seed)
-    torch.random.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed(args.seed)
-
-    # Prepare GLUE task
-    task_name = args.task_name.lower()
-    processor = processors[task_name]()
-    label_list = processor.get_labels()
-    args.output_mode = output_modes[task_name]
-    args.num_labels = len(label_list)
-
-    # Prepare output directory
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
-    # Load model & tokenizer
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only one distributed process download model & vocab
-    tokenizer = BertTokenizer.from_pretrained(args.model_name_or_path)
-
-    # Load a model with all BERTology options on:
-    #   output_attentions => will output attention weights
-    #   keep_multihead_output => will store gradient of attention head outputs for head importance computation
-    #       see: http://arxiv.org/abs/1905.10650
-    model = BertForSequenceClassification.from_pretrained(args.model_name_or_path,
-                                                          num_labels=args.num_labels,
-                                                          output_attentions=True,
-                                                          keep_multihead_output=True)
-    if args.local_rank == 0:
-        torch.distributed.barrier()  # Make sure only one distributed process download model & vocab
-    model.to(args.device)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True)
-    model.eval()
-
-    # Prepare dataset for the GLUE task
-    eval_examples = processor.get_dev_examples(args.data_dir)
-    cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
-        list(filter(None, args.model_name_or_path.split('/'))).pop(), str(args.max_seq_length), str(task_name)))
-    try:
-        eval_features = torch.load(cached_eval_features_file)
-    except:
-        eval_features = convert_examples_to_features(eval_examples, label_list, args.max_seq_length, tokenizer, args.output_mode)
-        if args.local_rank in [-1, 0]:
-            logger.info("Saving eval features to cache file %s", cached_eval_features_file)
-            torch.save(eval_features, cached_eval_features_file)
-
-    all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-    all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-    all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long if args.output_mode == "classification" else torch.float)
-    eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-
-    if args.data_subset > 0:
-        eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data)))))
-
-    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
-
-    # Print/save training arguments
-    print(args)
-    torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))
-
-    # Compute head entropy and importance score
-    attn_entropy, head_importance, _, _ = compute_heads_importance(args, model, eval_dataloader)
-
-    # Print/save matrices
-    np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy())
-    np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy())
-
-    logger.info("Attention entropies")
-    print_2d_tensor(attn_entropy)
-    logger.info("Head importance scores")
-    print_2d_tensor(head_importance)
-    logger.info("Head ranked by importance scores")
-    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
-    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device)
-    head_ranks = head_ranks.view_as(head_importance)
-    print_2d_tensor(head_ranks)
-
-    # Do masking if we want to
-    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
-        _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
-        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        original_score = compute_metrics(task_name, preds, labels)[args.metric_name]
-        logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)
-
-        new_head_mask = torch.ones_like(head_importance)
-        num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))
-
-        current_score = original_score
-        while current_score >= original_score * args.masking_threshold:
-            head_mask = new_head_mask.clone() # save current head mask
-            # heads from least important to most - keep only not-masked heads
-            head_importance[head_mask == 0.0] = float('Inf')
-            current_heads_to_mask = head_importance.view(-1).sort()[1]
-
-            if len(current_heads_to_mask) <= num_to_mask:
-                break
-
-            # mask heads
-            current_heads_to_mask = current_heads_to_mask[:num_to_mask]
-            logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
-            new_head_mask = new_head_mask.view(-1)
-            new_head_mask[current_heads_to_mask] = 0.0
-            new_head_mask = new_head_mask.view_as(head_mask)
-            print_2d_tensor(new_head_mask)
-
-            # Compute metric and head importance again
-            _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask)
-            preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-            current_score = compute_metrics(task_name, preds, labels)[args.metric_name]
-            logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100)
-
-        logger.info("Final head mask")
-        print_2d_tensor(head_mask)
-        np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy())
-
-        # Try pruning and test time speedup
-        # Pruning is like masking but we actually remove the masked weights
-        before_time = datetime.now()
-        _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
-                                                       compute_entropy=False, compute_importance=False, head_mask=head_mask)
-        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        score_masking = compute_metrics(task_name, preds, labels)[args.metric_name]
-        original_time = datetime.now() - before_time
-
-        original_num_params = sum(p.numel() for p in model.parameters())
-        heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().tolist()) for layer in range(len(head_mask)))
-        assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
-        model.bert.prune_heads(heads_to_prune)
-        pruned_num_params = sum(p.numel() for p in model.parameters())
-
-        before_time = datetime.now()
-        _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
-                                                       compute_entropy=False, compute_importance=False, head_mask=None)
-        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
-        score_pruning = compute_metrics(task_name, preds, labels)[args.metric_name]
-        new_time = datetime.now() - before_time
-
-        logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100)
-        logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
-        logger.info("Pruning: speed ratio (new timing / original timing): %f percents", original_time/new_time * 100)
-
-if __name__ == '__main__':
-    run_model()
--- a/examples/extract_features.py
+++ b/examples/extract_features.py
@@ -1,297 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Extract pre-computed feature vectors from a PyTorch BERT model."""
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import argparse
-import collections
-import logging
-import json
-import re
-
-import torch
-from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
-from torch.utils.data.distributed import DistributedSampler
-
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.modeling import BertModel
-
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger(__name__)
-
-
-class InputExample(object):
-
-    def __init__(self, unique_id, text_a, text_b):
-        self.unique_id = unique_id
-        self.text_a = text_a
-        self.text_b = text_b
-
-
-class InputFeatures(object):
-    """A single set of features of data."""
-
-    def __init__(self, unique_id, tokens, input_ids, input_mask, input_type_ids):
-        self.unique_id = unique_id
-        self.tokens = tokens
-        self.input_ids = input_ids
-        self.input_mask = input_mask
-        self.input_type_ids = input_type_ids
-
-
-def convert_examples_to_features(examples, seq_length, tokenizer):
-    """Loads a data file into a list of `InputFeature`s."""
-
-    features = []
-    for (ex_index, example) in enumerate(examples):
-        tokens_a = tokenizer.tokenize(example.text_a)
-
-        tokens_b = None
-        if example.text_b:
-            tokens_b = tokenizer.tokenize(example.text_b)
-
-        if tokens_b:
-            # Modifies `tokens_a` and `tokens_b` in place so that the total
-            # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3"
-            _truncate_seq_pair(tokens_a, tokens_b, seq_length - 3)
-        else:
-            # Account for [CLS] and [SEP] with "- 2"
-            if len(tokens_a) > seq_length - 2:
-                tokens_a = tokens_a[0:(seq_length - 2)]
-
-        # The convention in BERT is:
-        # (a) For sequence pairs:
-        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
-        #  type_ids:   0   0  0    0    0     0      0   0    1  1  1   1  1   1
-        # (b) For single sequences:
-        #  tokens:   [CLS] the dog is hairy . [SEP]
-        #  type_ids:   0   0   0   0  0     0   0
-        #
-        # Where "type_ids" are used to indicate whether this is the first
-        # sequence or the second sequence. The embedding vectors for `type=0` and
-        # `type=1` were learned during pre-training and are added to the wordpiece
-        # embedding vector (and position vector). This is not *strictly* necessary
-        # since the [SEP] token unambigiously separates the sequences, but it makes
-        # it easier for the model to learn the concept of sequences.
-        #
-        # For classification tasks, the first vector (corresponding to [CLS]) is
-        # used as as the "sentence vector". Note that this only makes sense because
-        # the entire model is fine-tuned.
-        tokens = []
-        input_type_ids = []
-        tokens.append("[CLS]")
-        input_type_ids.append(0)
-        for token in tokens_a:
-            tokens.append(token)
-            input_type_ids.append(0)
-        tokens.append("[SEP]")
-        input_type_ids.append(0)
-
-        if tokens_b:
-            for token in tokens_b:
-                tokens.append(token)
-                input_type_ids.append(1)
-            tokens.append("[SEP]")
-            input_type_ids.append(1)
-
-        input_ids = tokenizer.convert_tokens_to_ids(tokens)
-
-        # The mask has 1 for real tokens and 0 for padding tokens. Only real
-        # tokens are attended to.
-        input_mask = [1] * len(input_ids)
-
-        # Zero-pad up to the sequence length.
-        while len(input_ids) < seq_length:
-            input_ids.append(0)
-            input_mask.append(0)
-            input_type_ids.append(0)
-
-        assert len(input_ids) == seq_length
-        assert len(input_mask) == seq_length
-        assert len(input_type_ids) == seq_length
-
-        if ex_index < 5:
-            logger.info("*** Example ***")
-            logger.info("unique_id: %s" % (example.unique_id))
-            logger.info("tokens: %s" % " ".join([str(x) for x in tokens]))
-            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
-            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info(
-                "input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
-
-        features.append(
-            InputFeatures(
-                unique_id=example.unique_id,
-                tokens=tokens,
-                input_ids=input_ids,
-                input_mask=input_mask,
-                input_type_ids=input_type_ids))
-    return features
-
-
-def _truncate_seq_pair(tokens_a, tokens_b, max_length):
-    """Truncates a sequence pair in place to the maximum length."""
-
-    # This is a simple heuristic which will always truncate the longer sequence
-    # one token at a time. This makes more sense than truncating an equal percent
-    # of tokens from each, since if one sequence is very short then each token
-    # that's truncated likely contains more information than a longer sequence.
-    while True:
-        total_length = len(tokens_a) + len(tokens_b)
-        if total_length <= max_length:
-            break
-        if len(tokens_a) > len(tokens_b):
-            tokens_a.pop()
-        else:
-            tokens_b.pop()
-
-
-def read_examples(input_file):
-    """Read a list of `InputExample`s from an input file."""
-    examples = []
-    unique_id = 0
-    with open(input_file, "r", encoding='utf-8') as reader:
-        while True:
-            line = reader.readline()
-            if not line:
-                break
-            line = line.strip()
-            text_a = None
-            text_b = None
-            m = re.match(r"^(.*) \|\|\| (.*)$", line)
-            if m is None:
-                text_a = line
-            else:
-                text_a = m.group(1)
-                text_b = m.group(2)
-            examples.append(
-                InputExample(unique_id=unique_id, text_a=text_a, text_b=text_b))
-            unique_id += 1
-    return examples
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--input_file", default=None, type=str, required=True)
-    parser.add_argument("--output_file", default=None, type=str, required=True)
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
-
-    ## Other parameters
-    parser.add_argument("--do_lower_case", action='store_true', help="Set this flag if you are using an uncased model.")
-    parser.add_argument("--layers", default="-1,-2,-3,-4", type=str)
-    parser.add_argument("--max_seq_length", default=128, type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
-                            "than this will be truncated, and sequences shorter than this will be padded.")
-    parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help = "local_rank for distributed training on gpus")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-
-    args = parser.parse_args()
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-    logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))
-
-    layer_indexes = [int(x) for x in args.layers.split(",")]
-
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-
-    examples = read_examples(args.input_file)
-
-    features = convert_examples_to_features(
-        examples=examples, seq_length=args.max_seq_length, tokenizer=tokenizer)
-
-    unique_id_to_feature = {}
-    for feature in features:
-        unique_id_to_feature[feature.unique_id] = feature
-
-    model = BertModel.from_pretrained(args.bert_model)
-    model.to(device)
-
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
-                                                          output_device=args.local_rank)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
-    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-
-    eval_data = TensorDataset(all_input_ids, all_input_mask, all_example_index)
-    if args.local_rank == -1:
-        eval_sampler = SequentialSampler(eval_data)
-    else:
-        eval_sampler = DistributedSampler(eval_data)
-    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
-
-    model.eval()
-    with open(args.output_file, "w", encoding='utf-8') as writer:
-        for input_ids, input_mask, example_indices in eval_dataloader:
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-
-            all_encoder_layers, _ = model(input_ids, token_type_ids=None, attention_mask=input_mask)
-            all_encoder_layers = all_encoder_layers
-
-            for b, example_index in enumerate(example_indices):
-                feature = features[example_index.item()]
-                unique_id = int(feature.unique_id)
-                # feature = unique_id_to_feature[unique_id]
-                output_json = collections.OrderedDict()
-                output_json["linex_index"] = unique_id
-                all_out_features = []
-                for (i, token) in enumerate(feature.tokens):
-                    all_layers = []
-                    for (j, layer_index) in enumerate(layer_indexes):
-                        layer_output = all_encoder_layers[int(layer_index)].detach().cpu().numpy()
-                        layer_output = layer_output[b]
-                        layers = collections.OrderedDict()
-                        layers["index"] = layer_index
-                        layers["values"] = [
-                            round(x.item(), 6) for x in layer_output[i]
-                        ]
-                        all_layers.append(layers)
-                    out_features = collections.OrderedDict()
-                    out_features["token"] = token
-                    out_features["layers"] = all_layers
-                    all_out_features.append(out_features)
-                output_json["features"] = all_out_features
-                writer.write(json.dumps(output_json) + "\n")
-
-
-if __name__ == "__main__":
-    main()
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -13,10 +13,10 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm

-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForPreTraining
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForPreTraining
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

 InputFeatures = namedtuple("InputFeatures", "input_ids input_mask segment_ids lm_label_ids is_next")

@@ -273,7 +273,7 @@ def main():
        warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
                                             t_total=num_train_optimization_steps)
    else:
-        optimizer = BertAdam(optimizer_grouped_parameters,
+        optimizer = AdamW(optimizer_grouped_parameters,
                             lr=args.learning_rate,
                             warmup=args.warmup_proportion,
                             t_total=num_train_optimization_steps)
--- a/examples/lm_finetuning/pregenerate_training_data.py
+++ b/examples/lm_finetuning/pregenerate_training_data.py
@@ -6,7 +6,7 @@ import shelve
 from multiprocessing import Pool

 from random import random, randrange, randint, shuffle, choice
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_transformers.tokenization_bert import BertTokenizer
 import numpy as np
 import json
 import collections
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -29,10 +29,10 @@ from torch.utils.data import DataLoader, Dataset, RandomSampler
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

-from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForPreTraining
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForPreTraining
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.optimization import BertAdam, WarmupLinearSchedule

 logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -0,0 +1,346 @@
+#!/usr/bin/env python3
+# Copyright 2018 CMU and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Bertology: this script shows how you can explore the internals of the models in the library to:
+    - compute the entropy of the head attentions
+    - compute the importance of each head
+    - prune (remove) the low importance head.
+    Some parts of this script are adapted from the code of Michel et al. (http://arxiv.org/abs/1905.10650)
+    which is available at https://github.com/pmichel31415/are-16-heads-really-better-than-1
+"""
+import os
+import argparse
+import logging
+from datetime import timedelta, datetime
+from tqdm import tqdm
+
+import numpy as np
+
+import torch
+from torch.utils.data import DataLoader, SequentialSampler, TensorDataset, Subset
+from torch.utils.data.distributed import DistributedSampler
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from pytorch_transformers import (WEIGHTS_NAME,
+                                  BertConfig, BertForSequenceClassification, BertTokenizer,
+                                  XLMConfig, XLMForSequenceClassification, XLMTokenizer,
+                                  XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer)
+
+from run_glue import set_seed, load_and_cache_examples, ALL_MODELS, MODEL_CLASSES
+
+from utils_glue import (compute_metrics, convert_examples_to_features,
+                        output_modes, processors)
+
+logger = logging.getLogger(__name__)
+
+
+def entropy(p):
+    """ Entropy (natural log) of the probability distributions stored along the last dimension of `p`.
+
+        Entries with zero probability contribute 0 to the sum instead of the NaN produced by `0 * log(0)`.
+        The last dimension is reduced; all the other dimensions are kept.
+    """
+    weighted_log = torch.log(p) * p
+    # 0 * log(0) evaluates to NaN: replace those terms by 0
+    weighted_log = weighted_log.masked_fill(p == 0, 0)
+    total = weighted_log.sum(dim=-1)
+    return -total
+
+
+def print_2d_tensor(tensor):
+    """ Log a 2D tensor as a tab-separated table: one header line, then one line per row.
+
+        Rows are labelled "layer 1", "layer 2", ... and columns 1, 2, ...
+        Values of a `torch.long` tensor are printed as integers, any other dtype with 5 decimals.
+    """
+    # One label per column: use the size of the second dimension (not the number of rows),
+    # otherwise the header is wrong as soon as the tensor is not square.
+    logger.info("lv, h >\t" + "\t".join(f"{x + 1}" for x in range(tensor.shape[1])))
+    for row in range(len(tensor)):
+        if tensor.dtype != torch.long:
+            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:.5f}" for x in tensor[row].cpu().data))
+        else:
+            logger.info(f"layer {row + 1}:\t" + "\t".join(f"{x:d}" for x in tensor[row].cpu().data))
+
+
+def compute_heads_importance(args, model, eval_dataloader, compute_entropy=True, compute_importance=True, head_mask=None):
+    """ This method shows how to compute:
+        - head attention entropy
+        - head importance scores according to http://arxiv.org/abs/1905.10650
+
+        Args:
+            args: parsed arguments; `device`, `local_rank`, `output_dir`, `dont_normalize_importance_by_layer`
+                and `dont_normalize_global_importance` are read here.
+            model: model exposing `model.bert.config`; called with labels and a head mask, its outputs must
+                start with (loss, logits) and end with the attentions of all the layers.
+            eval_dataloader: iterable of (input_ids, input_mask, segment_ids, label_ids) batches.
+            compute_entropy: if True, accumulate the entropy of the attention of each head.
+            compute_importance: if True, accumulate the absolute gradient of the loss w.r.t. the head mask.
+            head_mask: optional mask given to the model (defaults to a (n_layers, n_heads) tensor of ones).
+                The tensor passed by the caller is not modified.
+
+        Returns:
+            (attn_entropy, head_importance, preds, labels): the two first are (n_layers, n_heads) tensors,
+            `preds` and `labels` are numpy arrays with the logits and the labels of all the batches.
+
+        Side effects: saves `attn_entropy.npy` and `head_importance.npy` in `args.output_dir` and logs
+        the entropies, the importance scores and the rank of each head.
+    """
+    # Prepare our tensors
+    n_layers, n_heads = model.bert.config.num_hidden_layers, model.bert.config.num_attention_heads
+    head_importance = torch.zeros(n_layers, n_heads).to(args.device)
+    attn_entropy = torch.zeros(n_layers, n_heads).to(args.device)
+
+    if head_mask is None:
+        head_mask = torch.ones(n_layers, n_heads).to(args.device)
+    else:
+        # Work on a private copy: turning the caller's tensor into a leaf which requires grad
+        # would forbid any later in-place update of that tensor (e.g. masking more heads)
+        head_mask = head_mask.clone().detach()
+    head_mask.requires_grad_(requires_grad=True)
+    preds = None
+    labels = None
+    tot_tokens = 0.0
+
+    for step, batch in enumerate(tqdm(eval_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
+        batch = tuple(t.to(args.device) for t in batch)
+        input_ids, input_mask, segment_ids, label_ids = batch
+
+        # Do a forward pass (not with torch.no_grad() since we need gradients for importance score - see below)
+        outputs = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=label_ids, head_mask=head_mask)
+        loss, logits, all_attentions = outputs[0], outputs[1], outputs[-1]  # Loss and logits are the first, attention the last
+        loss.backward()  # Backpropagate to populate the gradients in the head mask
+
+        if compute_entropy:
+            for layer, attn in enumerate(all_attentions):
+                # Zero out the entropy of the padding positions before summing over tokens and batch
+                masked_entropy = entropy(attn.detach()) * input_mask.float().unsqueeze(1)
+                attn_entropy[layer] += masked_entropy.sum(-1).sum(0).detach()
+
+        if compute_importance:
+            head_importance += head_mask.grad.abs().detach()
+
+        # Gradients accumulate over successive backward passes: reset them so that the next batch
+        # contributes the absolute value of its own gradient and not of the running sum
+        if head_mask.grad is not None:
+            head_mask.grad.zero_()
+
+        # Also store our logits/labels if we want to compute metrics afterwards
+        if preds is None:
+            preds = logits.detach().cpu().numpy()
+            labels = label_ids.detach().cpu().numpy()
+        else:
+            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+            labels = np.append(labels, label_ids.detach().cpu().numpy(), axis=0)
+
+        tot_tokens += input_mask.float().detach().sum().data
+
+    # Normalize
+    attn_entropy /= tot_tokens
+    head_importance /= tot_tokens
+    # Layerwise importance normalization
+    if not args.dont_normalize_importance_by_layer:
+        exponent = 2
+        norm_by_layer = torch.pow(torch.pow(head_importance, exponent).sum(-1), 1/exponent)
+        head_importance /= norm_by_layer.unsqueeze(-1) + 1e-20
+
+    if not args.dont_normalize_global_importance:
+        head_importance = (head_importance - head_importance.min()) / (head_importance.max() - head_importance.min())
+
+    # Print/save matrices
+    np.save(os.path.join(args.output_dir, 'attn_entropy.npy'), attn_entropy.detach().cpu().numpy())
+    np.save(os.path.join(args.output_dir, 'head_importance.npy'), head_importance.detach().cpu().numpy())
+
+    logger.info("Attention entropies")
+    print_2d_tensor(attn_entropy)
+    logger.info("Head importance scores")
+    print_2d_tensor(head_importance)
+    logger.info("Head ranked by importance scores")
+    # head_ranks[i] is the rank of head i (0 = most important) in the flattened importance matrix
+    head_ranks = torch.zeros(head_importance.numel(), dtype=torch.long, device=args.device)
+    head_ranks[head_importance.view(-1).sort(descending=True)[1]] = torch.arange(head_importance.numel(), device=args.device)
+    head_ranks = head_ranks.view_as(head_importance)
+    print_2d_tensor(head_ranks)
+
+    return attn_entropy, head_importance, preds, labels
+
+
+def mask_heads(args, model, eval_dataloader):
+    """ This method shows how to mask head (set some heads to zero), to test the effect on the network,
+        based on the head importance scores, as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+
+        At each step the `num_to_mask` least important heads which are still active are masked, then the
+        metric `args.metric_name` and the importance scores are computed again. The loop stops when the
+        metric falls under `args.masking_threshold` times its original value, or when there are not
+        more than `num_to_mask` active heads left.
+
+        Returns the last head mask for which the metric was still above the threshold (1.0 = head kept,
+        0.0 = head masked). This mask is also saved as `head_mask.npy` in `args.output_dir`.
+    """
+    _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False)
+    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
+    original_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    logger.info("Pruning: original score: %f, threshold: %f", original_score, original_score * args.masking_threshold)
+
+    new_head_mask = torch.ones_like(head_importance)
+    num_to_mask = max(1, int(new_head_mask.numel() * args.masking_amount))
+
+    current_score = original_score
+    while current_score >= original_score * args.masking_threshold:
+        # save current head mask - detached, so that it can be copied and updated in-place below
+        # even if the previous importance computation has made `new_head_mask` require grad
+        head_mask = new_head_mask.detach().clone()
+        # heads from least important to most - keep only not-masked heads
+        head_importance[head_mask == 0.0] = float('Inf')
+        current_heads_to_mask = head_importance.view(-1).sort()[1]
+        # masked heads are now sorted last (Inf): drop them to keep only the heads which are still active
+        current_heads_to_mask = current_heads_to_mask[:int(head_mask.sum().item())]
+
+        if len(current_heads_to_mask) <= num_to_mask:
+            break
+
+        # mask heads
+        current_heads_to_mask = current_heads_to_mask[:num_to_mask]
+        logger.info("Heads to mask: %s", str(current_heads_to_mask.tolist()))
+        new_head_mask = head_mask.clone().view(-1)
+        new_head_mask[current_heads_to_mask] = 0.0
+        new_head_mask = new_head_mask.view_as(head_mask)
+        print_2d_tensor(new_head_mask)
+
+        # Compute metric and head importance again
+        _, head_importance, preds, labels = compute_heads_importance(args, model, eval_dataloader, compute_entropy=False, head_mask=new_head_mask)
+        preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
+        current_score = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+        logger.info("Masking: current score: %f, remaning heads %d (%.1f percents)", current_score, new_head_mask.sum(), new_head_mask.sum()/new_head_mask.numel() * 100)
+
+    logger.info("Final head mask")
+    print_2d_tensor(head_mask)
+    np.save(os.path.join(args.output_dir, 'head_mask.npy'), head_mask.detach().cpu().numpy())
+
+    return head_mask
+
+
+def prune_heads(args, model, eval_dataloader, head_mask):
+    """ This method shows how to prune head (remove heads weights) based on
+        the head importance scores as described in Michel et al. (http://arxiv.org/abs/1905.10650)
+
+        The model is first evaluated with `head_mask` applied, then the heads with a zero in `head_mask`
+        are removed with `model.prune_heads` (the model is modified in-place) and the model is evaluated
+        again without any mask. Number of parameters, scores and timings of both runs are logged.
+        Nothing is returned.
+    """
+    # Try pruning and test time speedup
+    # Pruning is like masking but we actually remove the masked weights
+    before_time = datetime.now()
+    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
+                                                   compute_entropy=False, compute_importance=False, head_mask=head_mask)
+    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
+    score_masking = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    original_time = datetime.now() - before_time
+
+    original_num_params = sum(p.numel() for p in model.parameters())
+    # `nonzero()` returns one row per index: flatten it to get a plain list of head indices for each layer
+    heads_to_prune = dict((layer, (1 - head_mask[layer].long()).nonzero().view(-1).tolist()) for layer in range(len(head_mask)))
+    assert sum(len(h) for h in heads_to_prune.values()) == (1 - head_mask.long()).sum().item()
+    model.prune_heads(heads_to_prune)
+    pruned_num_params = sum(p.numel() for p in model.parameters())
+
+    before_time = datetime.now()
+    _, _, preds, labels = compute_heads_importance(args, model, eval_dataloader,
+                                                    compute_entropy=False, compute_importance=False, head_mask=None)
+    preds = np.argmax(preds, axis=1) if args.output_mode == "classification" else np.squeeze(preds)
+    score_pruning = compute_metrics(args.task_name, preds, labels)[args.metric_name]
+    new_time = datetime.now() - before_time
+
+    logger.info("Pruning: original num of params: %.2e, after pruning %.2e (%.1f percents)", original_num_params, pruned_num_params, pruned_num_params/original_num_params * 100)
+    logger.info("Pruning: score with masking: %f score with pruning: %f", score_masking, score_pruning)
+    # The logged value is original_time / new_time: the label now says so
+    logger.info("Pruning: speed ratio (original timing / new timing): %f percents", original_time/new_time * 100)
+
+
+def main():
+    """Command-line entry point: score attention heads on a GLUE task, then optionally mask and prune them.
+
+    Steps, in order: parse the arguments, optionally attach a remote debugger, select the
+    device (single process or torch.distributed), load the config/tokenizer/model designated
+    by ``--model_name``, build the evaluation dataloader, call ``compute_heads_importance``
+    and, when ``--try_masking`` is set with ``0 < --masking_threshold < 1``, call
+    ``mask_heads`` followed by ``prune_heads``.
+
+    Raises:
+        ValueError: if ``--task_name`` is not a key of ``processors``, or if ``--model_name``
+            contains none of the keys of ``MODEL_CLASSES``.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--model_name", default=None, type=str, required=True,
+                        help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--task_name", default=None, type=str, required=True,
+                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument("--data_subset", type=int, default=-1,
+                        help="If > 0: limit the data to a subset of data_subset instances.")
+    parser.add_argument("--overwrite_output_dir", action='store_true',
+                        help="Whether to overwrite data in output directory")
+
+    parser.add_argument("--dont_normalize_importance_by_layer", action='store_true',
+                        help="Don't normalize importance score by layers")
+    parser.add_argument("--dont_normalize_global_importance", action='store_true',
+                        help="Don't normalize all importance scores between 0 and 1")
+
+    parser.add_argument("--try_masking", action='store_true',
+                        help="Whether to try to mask head until a threshold of accuracy.")
+    parser.add_argument("--masking_threshold", default=0.9, type=float,
+                        help="masking threshold in term of metrics (stop masking when metric < threshold * original metric value).")
+    parser.add_argument("--masking_amount", default=0.1, type=float,
+                        help="Amount to heads to masking at each masking step.")
+    parser.add_argument("--metric_name", default="acc", type=str,
+                        help="Metric to use for head masking.")
+
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. \n"
+                             "Sequences longer than this will be truncated, sequences shorter padded.")
+    parser.add_argument("--batch_size", default=1, type=int, help="Batch size.")
+
+    parser.add_argument("--seed", type=int, default=42)
+    parser.add_argument("--local_rank", type=int, default=-1, help="local_rank for distributed training on gpus")
+    parser.add_argument("--no_cuda", action='store_true', help="Whether not to use CUDA when available")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup devices and distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        # Report zero GPUs when CUDA is disabled, otherwise the CPU model would be
+        # wrapped in DataParallel below on any multi-GPU host.
+        args.n_gpu = 0 if args.no_cuda else torch.cuda.device_count()
+    else:
+        torch.cuda.set_device(args.local_rank)
+        args.device = torch.device("cuda", args.local_rank)
+        args.n_gpu = 1
+        torch.distributed.init_process_group(backend='nccl')  # Initializes the distributed backend
+
+    # Setup logging
+    logging.basicConfig(level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.info("device: {} n_gpu: {}, distributed: {}".format(args.device, args.n_gpu, bool(args.local_rank != -1)))
+
+    # Set seeds
+    set_seed(args)
+
+    # Prepare GLUE task
+    args.task_name = args.task_name.lower()
+    if args.task_name not in processors:
+        raise ValueError("Task not found: %s" % (args.task_name))
+    processor = processors[args.task_name]()
+    args.output_mode = output_modes[args.task_name]
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    # Resolve the model type from the model name. This is done before the distributed
+    # barrier so that an invalid name fails in every process instead of leaving the
+    # non-zero ranks waiting for a rank 0 that has already crashed.
+    args.model_type = ""
+    for key in MODEL_CLASSES:
+        if key in args.model_name.lower():
+            args.model_type = key  # take the first match in model types
+            break
+    if not args.model_type:
+        raise ValueError("Model type not found in model name: %s (expected one of: %s)"
+                         % (args.model_name, ", ".join(MODEL_CLASSES)))
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name,
+                                          num_labels=num_labels, finetuning_task=args.task_name,
+                                          output_attentions=True)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name)
+    model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    # Distributed and parallel training
+    model.to(args.device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+    elif args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Print/save training arguments
+    # torch.save does not create missing directories. Several processes may reach this
+    # point at the same time, so losing the creation race is tolerated.
+    if not os.path.exists(args.output_dir):
+        try:
+            os.makedirs(args.output_dir)
+        except OSError:
+            if not os.path.isdir(args.output_dir):
+                raise
+    torch.save(args, os.path.join(args.output_dir, 'run_args.bin'))
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Prepare dataset for the GLUE task
+    eval_data = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=True)
+    if args.data_subset > 0:
+        eval_data = Subset(eval_data, list(range(min(args.data_subset, len(eval_data)))))
+    eval_sampler = SequentialSampler(eval_data) if args.local_rank == -1 else DistributedSampler(eval_data)
+    eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.batch_size)
+
+
+    # Compute head entropy and importance score
+    compute_heads_importance(args, model, eval_dataloader)
+
+
+    # Try head masking (set heads to zero until the score goes under a threshold)
+    # and head pruning (remove masked heads and see the effect on the network)
+    if args.try_masking and args.masking_threshold > 0.0 and args.masking_threshold < 1.0:
+        head_mask = mask_heads(args, model, eval_dataloader)
+        prune_heads(args, model, eval_dataloader, head_mask)
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/run_classifier.py
+++ b/examples/run_classifier.py
@@ -1,541 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""BERT finetuning runner."""
-
-from __future__ import absolute_import, division, print_function
-
-import argparse
-import logging
-import os
-import sys
-import random
-from tqdm import tqdm, trange
-
-import numpy as np
-
-import torch
-from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
-                              TensorDataset)
-from torch.utils.data.distributed import DistributedSampler
-from torch.nn import CrossEntropyLoss, MSELoss
-
-from tensorboardX import SummaryWriter
-
-from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForSequenceClassification
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-
-from run_classifier_dataset_utils import processors, output_modes, convert_examples_to_features, compute_metrics
-
-if sys.version_info[0] == 2:
-    import cPickle as pickle
-else:
-    import pickle
-
-
-logger = logging.getLogger(__name__)
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    ## Required parameters
-    parser.add_argument("--data_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                        "bert-base-multilingual-cased, bert-base-chinese.")
-    parser.add_argument("--task_name",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The name of the task to train.")
-    parser.add_argument("--output_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
-
-    ## Other parameters
-    parser.add_argument("--cache_dir",
-                        default="",
-                        type=str,
-                        help="Where do you want to store the pre-trained models downloaded from s3")
-    parser.add_argument("--max_seq_length",
-                        default=128,
-                        type=int,
-                        help="The maximum total input sequence length after WordPiece tokenization. \n"
-                             "Sequences longer than this will be truncated, and sequences shorter \n"
-                             "than this will be padded.")
-    parser.add_argument("--do_train",
-                        action='store_true',
-                        help="Whether to run training.")
-    parser.add_argument("--do_eval",
-                        action='store_true',
-                        help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Set this flag if you are using an uncased model.")
-    parser.add_argument("--train_batch_size",
-                        default=32,
-                        type=int,
-                        help="Total batch size for training.")
-    parser.add_argument("--eval_batch_size",
-                        default=8,
-                        type=int,
-                        help="Total batch size for eval.")
-    parser.add_argument("--learning_rate",
-                        default=5e-5,
-                        type=float,
-                        help="The initial learning rate for Adam.")
-    parser.add_argument("--num_train_epochs",
-                        default=3.0,
-                        type=float,
-                        help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_proportion",
-                        default=0.1,
-                        type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. "
-                             "E.g., 0.1 = 10%% of training.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
-                        help="Whether not to use CUDA when available")
-    parser.add_argument('--overwrite_output_dir',
-                        action='store_true',
-                        help="Overwrite the content of the output directory")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
-    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
-    args = parser.parse_args()
-
-    if args.server_ip and args.server_port:
-        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
-        import ptvsd
-        print("Waiting for debugger attach")
-        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
-        ptvsd.wait_for_attach()
-
-    if args.local_rank == -1 or args.no_cuda:
-        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
-        torch.cuda.set_device(args.local_rank)
-        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
-        torch.distributed.init_process_group(backend='nccl')
-    args.device = device
-
-    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                        datefmt = '%m/%d/%Y %H:%M:%S',
-                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
-
-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    if not args.do_train and not args.do_eval:
-        raise ValueError("At least one of `do_train` or `do_eval` must be True.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
-        os.makedirs(args.output_dir)
-
-    task_name = args.task_name.lower()
-
-    if task_name not in processors:
-        raise ValueError("Task not found: %s" % (task_name))
-
-    processor = processors[task_name]()
-    output_mode = output_modes[task_name]
-
-    label_list = processor.get_labels()
-    num_labels = len(label_list)
-
-    if args.local_rank not in [-1, 0]:
-        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-    model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
-    if args.local_rank == 0:
-        torch.distributed.barrier()
-
-    if args.fp16:
-        model.half()
-    model.to(device)
-    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model,
-                                                          device_ids=[args.local_rank],
-                                                          output_device=args.local_rank,
-                                                          find_unused_parameters=True)
-    elif n_gpu > 1:
-        model = torch.nn.DataParallel(model)
-
-    global_step = 0
-    nb_tr_steps = 0
-    tr_loss = 0
-
-    if args.do_train:
-        if args.local_rank in [-1, 0]:
-            tb_writer = SummaryWriter()
-
-        # Prepare data loader
-        train_examples = processor.get_train_examples(args.data_dir)
-        cached_train_features_file = os.path.join(args.data_dir, 'train_{0}_{1}_{2}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(),
-                        str(args.max_seq_length),
-                        str(task_name)))
-        try:
-            with open(cached_train_features_file, "rb") as reader:
-                train_features = pickle.load(reader)
-        except:
-            train_features = convert_examples_to_features(
-                train_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving train features into cached file %s", cached_train_features_file)
-                with open(cached_train_features_file, "wb") as writer:
-                    pickle.dump(train_features, writer)
-
-        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-
-        if output_mode == "classification":
-            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.long)
-        elif output_mode == "regression":
-            all_label_ids = torch.tensor([f.label_id for f in train_features], dtype=torch.float)
-
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-        if args.local_rank == -1:
-            train_sampler = RandomSampler(train_data)
-        else:
-            train_sampler = DistributedSampler(train_data)
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-
-        # Prepare optimizer
-
-        param_optimizer = list(model.named_parameters())
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        logger.info("***** Running training *****")
-        logger.info("  Num examples = %d", len(train_examples))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-
-        model.train()
-        for _ in trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]):
-            tr_loss = 0
-            nb_tr_examples, nb_tr_steps = 0, 0
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-                batch = tuple(t.to(device) for t in batch)
-                input_ids, input_mask, segment_ids, label_ids = batch
-
-                # define a new function to compute loss values for both output_modes
-                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
-
-                if output_mode == "classification":
-                    loss_fct = CrossEntropyLoss()
-                    loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-                elif output_mode == "regression":
-                    loss_fct = MSELoss()
-                    loss = loss_fct(logits.view(-1), label_ids.view(-1))
-
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-
-                tr_loss += loss.item()
-                nb_tr_examples += input_ids.size(0)
-                nb_tr_steps += 1
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used that handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-                    if args.local_rank in [-1, 0]:
-                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                        tb_writer.add_scalar('loss', loss.item(), global_step)
-
-    ### Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
-    ### Example:
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(args.output_dir)
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForSequenceClassification.from_pretrained(args.output_dir, num_labels=num_labels)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-
-        # Good practice: save your training arguments together with the trained model
-        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
-        torch.save(args, output_args_file)
-    else:
-        model = BertForSequenceClassification.from_pretrained(args.bert_model, num_labels=num_labels)
-
-    model.to(device)
-
-    ### Evaluation
-    if args.do_eval and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_examples = processor.get_dev_examples(args.data_dir)
-        cached_eval_features_file = os.path.join(args.data_dir, 'dev_{0}_{1}_{2}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(),
-                        str(args.max_seq_length),
-                        str(task_name)))
-        try:
-            with open(cached_eval_features_file, "rb") as reader:
-                eval_features = pickle.load(reader)
-        except:
-            eval_features = convert_examples_to_features(
-                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-            if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving eval features into cached file %s", cached_eval_features_file)
-                with open(cached_eval_features_file, "wb") as writer:
-                    pickle.dump(eval_features, writer)
-
-
-        logger.info("***** Running evaluation *****")
-        logger.info("  Num examples = %d", len(eval_examples))
-        logger.info("  Batch size = %d", args.eval_batch_size)
-        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-
-        if output_mode == "classification":
-            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-        elif output_mode == "regression":
-            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.float)
-
-        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-        # Run prediction for full data
-        if args.local_rank == -1:
-            eval_sampler = SequentialSampler(eval_data)
-        else:
-            eval_sampler = DistributedSampler(eval_data)  # Note that this sampler samples randomly
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-        model.eval()
-        eval_loss = 0
-        nb_eval_steps = 0
-        preds = []
-        out_label_ids = None
-
-        for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            label_ids = label_ids.to(device)
-
-            with torch.no_grad():
-                logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask)
-
-            # create eval loss and other metric required by the task
-            if output_mode == "classification":
-                loss_fct = CrossEntropyLoss()
-                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-            elif output_mode == "regression":
-                loss_fct = MSELoss()
-                tmp_eval_loss = loss_fct(logits.view(-1), label_ids.view(-1))
-
-            eval_loss += tmp_eval_loss.mean().item()
-            nb_eval_steps += 1
-            if len(preds) == 0:
-                preds.append(logits.detach().cpu().numpy())
-                out_label_ids = label_ids.detach().cpu().numpy()
-            else:
-                preds[0] = np.append(
-                    preds[0], logits.detach().cpu().numpy(), axis=0)
-                out_label_ids = np.append(
-                    out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
-
-        eval_loss = eval_loss / nb_eval_steps
-        preds = preds[0]
-        if output_mode == "classification":
-            preds = np.argmax(preds, axis=1)
-        elif output_mode == "regression":
-            preds = np.squeeze(preds)
-        result = compute_metrics(task_name, preds, out_label_ids)
-
-        loss = tr_loss/global_step if args.do_train else None
-
-        result['eval_loss'] = eval_loss
-        result['global_step'] = global_step
-        result['loss'] = loss
-
-        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
-        with open(output_eval_file, "w") as writer:
-            logger.info("***** Eval results *****")
-            for key in sorted(result.keys()):
-                logger.info("  %s = %s", key, str(result[key]))
-                writer.write("%s = %s\n" % (key, str(result[key])))
-
-        # hack for MNLI-MM
-        if task_name == "mnli":
-            task_name = "mnli-mm"
-            processor = processors[task_name]()
-
-            if os.path.exists(args.output_dir + '-MM') and os.listdir(args.output_dir + '-MM') and args.do_train:
-                raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-            if not os.path.exists(args.output_dir + '-MM'):
-                os.makedirs(args.output_dir + '-MM')
-
-            eval_examples = processor.get_dev_examples(args.data_dir)
-            eval_features = convert_examples_to_features(
-                eval_examples, label_list, args.max_seq_length, tokenizer, output_mode)
-            logger.info("***** Running evaluation *****")
-            logger.info("  Num examples = %d", len(eval_examples))
-            logger.info("  Batch size = %d", args.eval_batch_size)
-            all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-            all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-            all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-            all_label_ids = torch.tensor([f.label_id for f in eval_features], dtype=torch.long)
-
-            eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
-            # Run prediction for full data
-            eval_sampler = SequentialSampler(eval_data)
-            eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.eval_batch_size)
-
-            model.eval()
-            eval_loss = 0
-            nb_eval_steps = 0
-            preds = []
-            out_label_ids = None
-
-            for input_ids, input_mask, segment_ids, label_ids in tqdm(eval_dataloader, desc="Evaluating"):
-                input_ids = input_ids.to(device)
-                input_mask = input_mask.to(device)
-                segment_ids = segment_ids.to(device)
-                label_ids = label_ids.to(device)
-
-                with torch.no_grad():
-                    logits = model(input_ids, token_type_ids=segment_ids, attention_mask=input_mask, labels=None)
-
-                loss_fct = CrossEntropyLoss()
-                tmp_eval_loss = loss_fct(logits.view(-1, num_labels), label_ids.view(-1))
-
-                eval_loss += tmp_eval_loss.mean().item()
-                nb_eval_steps += 1
-                if len(preds) == 0:
-                    preds.append(logits.detach().cpu().numpy())
-                    out_label_ids = label_ids.detach().cpu().numpy()
-                else:
-                    preds[0] = np.append(
-                        preds[0], logits.detach().cpu().numpy(), axis=0)
-                    out_label_ids = np.append(
-                        out_label_ids, label_ids.detach().cpu().numpy(), axis=0)
-
-            eval_loss = eval_loss / nb_eval_steps
-            preds = preds[0]
-            preds = np.argmax(preds, axis=1)
-            result = compute_metrics(task_name, preds, out_label_ids)
-
-            loss = tr_loss/global_step if args.do_train else None
-
-            result['eval_loss'] = eval_loss
-            result['global_step'] = global_step
-            result['loss'] = loss
-
-            output_eval_file = os.path.join(args.output_dir + '-MM', "eval_results.txt")
-            with open(output_eval_file, "w") as writer:
-                logger.info("***** Eval results *****")
-                for key in sorted(result.keys()):
-                    logger.info("  %s = %s", key, str(result[key]))
-                    writer.write("%s = %s\n" % (key, str(result[key])))
-
-if __name__ == "__main__":
-    main()
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -0,0 +1,195 @@
+#!/usr/bin/env python3
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Conditional text generation with the auto-regressive models of the library (GPT/GPT-2/Transformer-XL/XLNet)
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import argparse
+import logging
+from tqdm import trange
+
+import torch
+import torch.nn.functional as F
+import numpy as np
+
+from pytorch_transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
+
+from pytorch_transformers import GPT2LMHeadModel, GPT2Tokenizer
+from pytorch_transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
+from pytorch_transformers import XLNetLMHeadModel, XLNetTokenizer
+from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
+
+
+logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                    datefmt = '%m/%d/%Y %H:%M:%S',
+                    level = logging.INFO)
+logger = logging.getLogger(__name__)
+
+MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop
+
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())  # flat tuple of the archive-map keys of all four config classes
+
+MODEL_CLASSES = {  # lower-cased --model_type value -> (LM-head model class, tokenizer class)
+    'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
+    'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
+    'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
+    'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
+}
+
+# Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
+# in https://github.com/rusiaaman/XLNet-gen#methodology
+# and https://medium.com/@amanrusia/xlnet-speaks-comparison-to-gpt-2-ea1a4e9ba39e
+PADDING_TEXT = """ In 1991, the remains of Russian Tsar Nicholas II and his family
+(except for Alexei and Maria) are discovered.
+The voice of Nicholas's young son, Tsarevich Alexei Nikolaevich, narrates the
+remainder of the story. 1883 Western Siberia,
+a young Grigori Rasputin is asked by his father and a group of men to perform magic.
+Rasputin has a vision and denounces one of the men as a horse thief. Although his
+father initially slaps him for making such an accusation, Rasputin watches as the
+man is chased outside and beaten. Twenty years later, Rasputin sees a vision of
+the Virgin Mary, prompting him to become a priest. Rasputin quickly becomes famous,
+with people, even a bishop, begging for his blessing. <eod> </s> <eos>"""
+
+
+def set_seed(args):  # Seed the numpy and torch RNGs (and all CUDA devices when args.n_gpu > 0) from args.seed
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:  # CUDA generators are only seeded when at least one GPU was counted
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
+    """ Mask a 1-D logits vector in place so only the top-k and/or nucleus (top-p) tokens stay sampleable.
+        Args:
+            logits: 1-D tensor of scores over the vocabulary (modified in place and returned)
+            top_k > 0: keep only the top_k highest-scoring tokens (top-k filtering).
+            top_p > 0.0: keep the most likely tokens until their cumulative probability exceeds top_p
+                (nucleus filtering, Holtzman et al., http://arxiv.org/abs/1904.09751).
+            filter_value: score written over every rejected token (default -inf, i.e. probability zero).
+        From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
+    """
+    assert logits.dim() == 1  # single distribution only; batching would need per-row index handling
+    k = min(top_k, logits.size(-1))  # never ask topk for more entries than exist
+    if k > 0:
+        # The k-th largest score is the cut-off: anything strictly below it is rejected
+        kth_best = torch.topk(logits, k)[0][..., -1, None]
+        logits[logits < kth_best] = filter_value
+
+    if top_p > 0.0:
+        ranked_logits, ranked_ids = torch.sort(logits, descending=True)
+        running_mass = torch.cumsum(F.softmax(ranked_logits, dim=-1), dim=-1)
+        drop = running_mass > top_p  # True once the cumulative probability has passed the threshold
+        # Shift the mask right by one so the token that crosses the threshold is kept too,
+        # and force the best-ranked token to survive whatever top_p is
+        drop[..., 1:] = drop[..., :-1].clone()
+        drop[..., 0] = 0
+
+        rejected_ids = ranked_ids[drop]
+        logits[rejected_ids] = filter_value
+    return logits
+
+
+def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, is_xlnet=False, device='cpu'):
+    """ Sample `length` tokens after `context` (list of token ids); returns a (num_samples, len(context) + length) LongTensor. """
+    context = torch.tensor(context, dtype=torch.long, device=device)
+    generated = context.unsqueeze(0).repeat(num_samples, 1)  # one independent row per requested sample
+    with torch.no_grad():
+        for _ in trange(length):
+            batch_size, seq_len = generated.shape[0], generated.shape[1] + 1  # +1 is the XLNet dummy slot added below
+            inputs = {'input_ids': generated}
+            if is_xlnet:
+                # XLNet is a direct (predict same token, not next token) and bi-directional model by default
+                # => need one additional dummy token in the input (will be masked), attention mask and target mapping (see model docstring)
+                input_ids = torch.cat((generated, torch.zeros((batch_size, 1), dtype=torch.long, device=device)), dim=1)
+                perm_mask = torch.zeros((batch_size, seq_len, seq_len), dtype=torch.float, device=device)
+                perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+                target_mapping = torch.zeros((batch_size, 1, seq_len), dtype=torch.float, device=device)
+                target_mapping[:, 0, -1] = 1.0  # predict last token
+                inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}
+
+            outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
+            next_token_logits = outputs[0][:, -1, :] / temperature  # last-position scores for every row, not just row 0
+            # top_k_top_p_filtering only accepts 1-D input, so each row is filtered and sampled on its own
+            next_tokens = torch.cat([torch.multinomial(F.softmax(top_k_top_p_filtering(row, top_k=top_k, top_p=top_p), dim=-1), num_samples=1).unsqueeze(0) for row in next_token_logits], dim=0)
+            generated = torch.cat((generated, next_tokens), dim=1)
+    return generated
+
+
+def main():  # CLI entry point: load a pretrained LM + tokenizer, then print sampled continuations of the prompt(s)
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--prompt", type=str, default="")  # empty => prompts are read interactively in a loop
+    parser.add_argument("--padding_text", type=str, default="")  # empty => PADDING_TEXT is used for transfo-xl/xlnet
+    parser.add_argument("--length", type=int, default=20)  # number of tokens to generate; negative values are resolved below
+    parser.add_argument("--temperature", type=float, default=1.0)  # the next-token logits are divided by this value
+    parser.add_argument("--top_k", type=int, default=0)  # 0 disables top-k filtering
+    parser.add_argument("--top_p", type=float, default=0.9)  # 0.0 disables nucleus filtering
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+    args = parser.parse_args()
+
+    args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+    args.n_gpu = torch.cuda.device_count()  # read by set_seed; counted even when --no_cuda is given
+
+    set_seed(args)
+
+    args.model_type = args.model_type.lower()
+    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]  # raises KeyError for an unsupported --model_type
+    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
+    model = model_class.from_pretrained(args.model_name_or_path)
+    model.to(args.device)
+    model.eval()
+
+    if args.length < 0 and model.config.max_position_embeddings > 0:  # negative length: use the model's own limit
+        args.length = model.config.max_position_embeddings
+    elif 0 < model.config.max_position_embeddings < args.length:
+        args.length = model.config.max_position_embeddings  # No generation bigger than model size
+    elif args.length < 0:
+        args.length = MAX_LENGTH  # avoid infinite loop
+
+    print(args)
+    while True:  # runs once when --prompt was given, otherwise keeps asking for prompts
+        raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
+        if args.model_type in ["transfo-xl", "xlnet"]:
+            # Models with memory like to have a long prompt for short inputs.
+            raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text
+        context_tokens = tokenizer.encode(raw_text)
+        out = sample_sequence(
+            model=model,
+            context=context_tokens,
+            length=args.length,
+            temperature=args.temperature,
+            top_k=args.top_k,
+            top_p=args.top_p,
+            device=args.device,
+            is_xlnet=bool(args.model_type == "xlnet"),
+        )
+        out = out[0, len(context_tokens):].tolist()  # first sample only, with the context (incl. padding text) cut off
+        text = tokenizer.decode(out, clean_up_tokenization_spaces=True)
+        print(text)
+        if args.prompt:
+            break
+    return text
+
+
+if __name__ == '__main__':
+    main()
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -0,0 +1,475 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet)."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import glob
+import logging
+import os
+import random
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+from tensorboardX import SummaryWriter
+from tqdm import tqdm, trange
+
+from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+                                  BertForSequenceClassification, BertTokenizer,
+                                  XLMConfig, XLMForSequenceClassification,
+                                  XLMTokenizer, XLNetConfig,
+                                  XLNetForSequenceClassification,
+                                  XLNetTokenizer)
+
+from pytorch_transformers import AdamW, WarmupLinearSchedule
+
+from utils_glue import (compute_metrics, convert_examples_to_features,
+                        output_modes, processors)
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ())  # flat tuple of the archive-map keys of all three config classes
+
+MODEL_CLASSES = {  # lower-cased --model_type value -> (config class, sequence-classification model class, tokenizer class)
+    'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
+    'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
+    'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
+}
+
+
+def set_seed(args):  # Seed the python, numpy and torch RNGs (and all CUDA devices when args.n_gpu > 0) from args.seed
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:  # CUDA generators are only seeded when at least one GPU was counted
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def train(args, train_dataset, model, tokenizer):
+    """ Fine-tune `model` on `train_dataset`; returns (global_step, tr_loss / global_step), i.e. the update count and the summed loss averaged over updates. """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()  # TensorBoard logging only on the single / first process
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ['bias', 'LayerNorm.weight']  # parameters whose name contains one of these get no weight decay
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0  # number of optimizer updates performed so far
+    tr_loss, logging_loss = 0.0, 0.0  # running loss sum, and its value at the previous logging point
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1],
+                      'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                      'labels':         batch[3]}
+            outputs = model(**inputs)
+            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+            else:
+                loss.backward()
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                clip_params = amp.master_params(optimizer) if args.fp16 else model.parameters()
+                torch.nn.utils.clip_grad_norm_(clip_params, args.max_grad_norm)  # clip once per update, on the fully accumulated gradient
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule (after optimizer.step(), as PyTorch >= 1.1 expects)
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step >= args.max_steps:  # stop after exactly max_steps updates (== t_total)
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step >= args.max_steps:  # also leave the epoch loop
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    """ Score `model` on the dev set(s) of args.task_name; writes eval_results.txt per task and returns the merged metrics dict. """
+    # MNLI is evaluated twice (matched, then mis-matched), each run with its own output directory
+    is_mnli = args.task_name == "mnli"
+    task_dir_pairs = [("mnli", args.output_dir), ("mnli-mm", args.output_dir + '-MM')] if is_mnli else [(args.task_name, args.output_dir)]
+
+    results = {}
+    for eval_task, eval_output_dir in task_dir_pairs:
+        eval_dataset = load_and_cache_examples(args, eval_task, tokenizer, evaluate=True)
+
+        if not os.path.exists(eval_output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(eval_output_dir)
+
+        args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+        # Note that DistributedSampler samples randomly
+        sampler_class = SequentialSampler if args.local_rank == -1 else DistributedSampler
+        eval_dataloader = DataLoader(eval_dataset, sampler=sampler_class(eval_dataset), batch_size=args.eval_batch_size)
+
+        # Eval!
+        logger.info("***** Running evaluation {} *****".format(prefix))
+        logger.info("  Num examples = %d", len(eval_dataset))
+        logger.info("  Batch size = %d", args.eval_batch_size)
+        eval_loss, nb_eval_steps = 0.0, 0
+        # Per-batch logits and gold labels are collected as numpy arrays and joined once after the loop
+        logit_chunks, label_chunks = [], []
+        uses_segment_ids = args.model_type in ['bert', 'xlnet']  # XLM don't use segment_ids
+        for batch in tqdm(eval_dataloader, desc="Evaluating"):
+            model.eval()
+            batch = tuple(t.to(args.device) for t in batch)
+
+            with torch.no_grad():
+                inputs = {'input_ids':      batch[0],
+                          'attention_mask': batch[1],
+                          'token_type_ids': batch[2] if uses_segment_ids else None,
+                          'labels':         batch[3]}
+                outputs = model(**inputs)
+                tmp_eval_loss, logits = outputs[:2]
+
+                eval_loss += tmp_eval_loss.mean().item()
+            nb_eval_steps += 1
+            logit_chunks.append(logits.detach().cpu().numpy())
+            label_chunks.append(inputs['labels'].detach().cpu().numpy())
+
+        eval_loss = eval_loss / nb_eval_steps
+        preds = np.concatenate(logit_chunks, axis=0)
+        out_label_ids = np.concatenate(label_chunks, axis=0)
+        if args.output_mode == "classification":
+            preds = np.argmax(preds, axis=1)  # index of the highest logit along axis 1
+        elif args.output_mode == "regression":
+            preds = np.squeeze(preds)  # drop the size-1 axes of the model output
+        result = compute_metrics(eval_task, preds, out_label_ids)
+        results.update(result)  # a metric name shared between tasks is overwritten by the later task
+
+        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            logger.info("***** Eval results {} *****".format(prefix))
+            for key in sorted(result.keys()):
+                value_text = str(result[key])
+                logger.info("  %s = %s", key, value_text)
+                writer.write("%s = %s\n" % (key, value_text))
+
+    return results
+
+
+def load_and_cache_examples(args, task, tokenizer, evaluate=False):
+    """ Build the TensorDataset (input ids, input mask, segment ids, labels) for `task`: dev split if `evaluate`, else train split. """
+    processor = processors[task]()
+    output_mode = output_modes[task]
+    # Load data features from cache or dataset file (an existing cache is ignored and rewritten with --overwrite_cache)
+    cached_features_file = os.path.join(args.data_dir, 'cached_{}_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        str(args.max_seq_length),
+        str(task)))
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        label_list = processor.get_labels()
+        examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
+            cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
+            cls_token=tokenizer.cls_token,
+            sep_token=tokenizer.sep_token,
+            cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,  # [CLS] sits in segment 0 for BERT, 2 for XLNet
+            pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
+            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    if output_mode == "classification":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.long)
+    elif output_mode == "regression":
+        all_label_ids = torch.tensor([f.label_id for f in features], dtype=torch.float)
+
+    return TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+
+
+def main():
+    """ Parse CLI arguments, set up device/logging, optionally train and save, then evaluate; returns the dict of evaluation results. """
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
+                        help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--task_name", default=None, type=str, required=True,
+                        help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Rul evaluation during training at each logging step.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
+
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Avoid using CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="For distributed training: local_rank")
+    parser.add_argument('--server_ip', type=str, default='', help="For distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="For distant debugging.")
+    args = parser.parse_args()
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+
+    # Set seed
+    set_seed(args)
+
+    # Prepare GLUE task
+    args.task_name = args.task_name.lower()
+    if args.task_name not in processors:
+        raise ValueError("Task not found: %s" % (args.task_name))
+    processor = processors[args.task_name]()
+    args.output_mode = output_modes[args.task_name]
+    label_list = processor.get_labels()
+    num_labels = len(label_list)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path, num_labels=num_labels, finetuning_task=args.task_name)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    # Distributed and parallel training
+    model.to(args.device)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+    elif args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, args.task_name, tokenizer, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)  # re-apply the CLI casing choice, as at the initial load
+        model.to(args.device)
+
+
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""  # step suffix of 'checkpoint-N' directories
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())  # metric keys get a '_<step>' suffix
+            results.update(result)
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
--- a/examples/run_gpt2.py
+++ b/examples/run_gpt2.py
@@ -1,131 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import logging
-from tqdm import trange
-
-import torch
-import torch.nn.functional as F
-import numpy as np
-
-from pytorch_pretrained_bert import GPT2LMHeadModel, GPT2Tokenizer
-
-logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
-                    datefmt = '%m/%d/%Y %H:%M:%S',
-                    level = logging.INFO)
-logger = logging.getLogger(__name__)
-
-def top_k_logits(logits, k):
-    """
-    Masks everything but the k top entries as -infinity (1e10).
-    Used to mask logits such that e^-infinity -> 0 won't contribute to the
-    sum of the denominator.
-    """
-    if k == 0:
-        return logits
-    else:
-        values = torch.topk(logits, k)[0]
-        batch_mins = values[:, -1].view(-1, 1).expand_as(logits)
-        return torch.where(logits < batch_mins, torch.ones_like(logits) * -1e10, logits)
-
-def sample_sequence(model, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, device='cuda', sample=True):
-    if start_token is None:
-        assert context is not None, 'Specify exactly one of start_token and context!'
-        context = torch.tensor(context, device=device, dtype=torch.long).unsqueeze(0).repeat(batch_size, 1)
-    else:
-        assert context is None, 'Specify exactly one of start_token and context!'
-        context = torch.full((batch_size, 1), start_token, device=device, dtype=torch.long)
-    prev = context
-    output = context
-    past = None
-    with torch.no_grad():
-        for i in trange(length):
-            logits, past = model(prev, past=past)
-            logits = logits[:, -1, :] / temperature
-            logits = top_k_logits(logits, k=top_k)
-            log_probs = F.softmax(logits, dim=-1)
-            if sample:
-                prev = torch.multinomial(log_probs, num_samples=1)
-            else:
-                _, prev = torch.topk(log_probs, k=1, dim=-1)
-            output = torch.cat((output, prev), dim=1)
-    return output
-
-def run_model():
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--model_name_or_path', type=str, default='gpt2', help='pretrained model name or path to local checkpoint')
-    parser.add_argument("--seed", type=int, default=0)
-    parser.add_argument("--nsamples", type=int, default=1)
-    parser.add_argument("--batch_size", type=int, default=-1)
-    parser.add_argument("--length", type=int, default=-1)
-    parser.add_argument("--temperature", type=float, default=1.0)
-    parser.add_argument("--top_k", type=int, default=0)
-    parser.add_argument('--unconditional', action='store_true', help='If true, unconditional generation.')
-    args = parser.parse_args()
-    print(args)
-
-    if args.batch_size == -1:
-        args.batch_size = 1
-    assert args.nsamples % args.batch_size == 0
-
-    np.random.seed(args.seed)
-    torch.random.manual_seed(args.seed)
-    torch.cuda.manual_seed(args.seed)
-    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
-    enc = GPT2Tokenizer.from_pretrained(args.model_name_or_path)
-    model = GPT2LMHeadModel.from_pretrained(args.model_name_or_path)
-    model.to(device)
-    model.eval()
-
-    if args.length == -1:
-        args.length = model.config.n_ctx // 2
-    elif args.length > model.config.n_ctx:
-        raise ValueError("Can't get samples longer than window size: %s" % model.config.n_ctx)
-
-    while True:
-        context_tokens = []
-        if not args.unconditional:
-            raw_text = input("Model prompt >>> ")
-            while not raw_text:
-                print('Prompt should not be empty!')
-                raw_text = input("Model prompt >>> ")
-            context_tokens = enc.encode(raw_text)
-            generated = 0
-            for _ in range(args.nsamples // args.batch_size):
-                out = sample_sequence(
-                    model=model, length=args.length,
-                    context=context_tokens,
-                    start_token=None,
-                    batch_size=args.batch_size,
-                    temperature=args.temperature, top_k=args.top_k, device=device
-                )
-                out = out[:, len(context_tokens):].tolist()
-                for i in range(args.batch_size):
-                    generated += 1
-                    text = enc.decode(out[i])
-                    print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
-                    print(text)
-            print("=" * 80)
-        else:
-            generated = 0
-            for _ in range(args.nsamples // args.batch_size):
-                out = sample_sequence(
-                    model=model, length=args.length,
-                    context=None,
-                    start_token=enc.encoder['<|endoftext|>'],
-                    batch_size=args.batch_size,
-                    temperature=args.temperature, top_k=args.top_k, device=device
-                )
-                out = out[:,1:].tolist()
-                for i in range(args.batch_size):
-                    generated += 1
-                    text = enc.decode(out[i])
-                    print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
-                    print(text)
-            print("=" * 80)
-
-if __name__ == '__main__':
-    run_model()
-
-
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Run BERT on SQuAD."""
+""" Finetuning the library models for question-answering on SQuAD (Bert, XLM, XLNet)."""

 from __future__ import absolute_import, division, print_function

@@ -21,8 +21,7 @@ import argparse
 import logging
 import os
 import random
-import sys
-from io import open
+import glob

 import numpy as np
 import torch
@@ -33,36 +32,306 @@ from tqdm import tqdm, trange

 from tensorboardX import SummaryWriter

-from pytorch_pretrained_bert.file_utils import WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForQuestionAnswering
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
+                                  BertForQuestionAnswering, BertTokenizer,
+                                  XLMConfig, XLMForQuestionAnswering,
+                                  XLMTokenizer, XLNetConfig,
+                                  XLNetForQuestionAnswering,
+                                  XLNetTokenizer)

-from run_squad_dataset_utils import read_squad_examples, convert_examples_to_features, RawResult, write_predictions
+from pytorch_transformers import AdamW, WarmupLinearSchedule

-if sys.version_info[0] == 2:
-    import cPickle as pickle
-else:
-    import pickle
+from utils_squad import (read_squad_examples, convert_examples_to_features,
+                         RawResult, write_predictions,
+                         RawResultExtended, write_predictions_extended)
+
+# The following import is the official SQuAD evaluation script (2.0).
+# You can remove it from the dependencies if you are using this script outside of the library
+# We've added it here for automated tests (see examples/test_examples.py file)
+from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad

 logger = logging.getLogger(__name__)

+# Flat tuple of every pretrained shortcut name declared by the supported config classes
+# (Bert first, then XLNet, then XLM); only used to build the --model_name_or_path help text.
+ALL_MODELS = tuple(shortcut_name
+                   for conf in (BertConfig, XLNetConfig, XLMConfig)
+                   for shortcut_name in conf.pretrained_config_archive_map.keys())
+
+# Maps the --model_type value to its (config class, question-answering model class, tokenizer class).
+MODEL_CLASSES = {
+    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
+    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
+}
+
+def set_seed(args):
+    """ Seed Python's `random`, NumPy and PyTorch with `args.seed` (and all CUDA devices when `args.n_gpu` > 0). """
+    seed = args.seed
+    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
+        seed_fn(seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(seed)
+
+def to_list(tensor):
+    """ Return the content of `tensor` as (nested) Python lists, after detaching it and moving it to CPU. """
+    cpu_copy = tensor.detach().cpu()
+    return cpu_copy.tolist()
+
+def train(args, train_dataset, model, tokenizer):
+    """ Train the model.
+
+    Runs the fine-tuning loop with AdamW and a linear warmup/decay schedule, with optional apex fp16,
+    gradient accumulation, TensorBoard logging and periodic checkpointing.
+
+    Args:
+        args: parsed command-line arguments. `args.train_batch_size` is set here and
+            `args.num_train_epochs` is overwritten when `args.max_steps` > 0.
+        train_dataset: dataset whose rows follow the training layout built by `load_and_cache_examples`:
+            (input_ids, input_mask, segment_ids, start_positions, end_positions, cls_index, p_mask).
+        model: model to optimize; it is called with keyword inputs and its first output is used as the loss.
+        tokenizer: only forwarded to `evaluate` when `args.evaluate_during_training` is set.
+
+    Returns:
+        A `(global_step, average_loss)` tuple: the number of optimization steps performed and the
+        accumulated loss divided by that number.
+    """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        # At least one update per epoch, so this cannot divide by zero when the dataloader
+        # holds fewer batches than `gradient_accumulation_steps`.
+        updates_per_epoch = max(1, len(train_dataloader) // args.gradient_accumulation_steps)
+        args.num_train_epochs = args.max_steps // updates_per_epoch + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            # The dataset stores the attention (input) mask at index 1 and the segment ids at index 2
+            inputs = {'input_ids':       batch[0],
+                      'attention_mask':  batch[1],
+                      'token_type_ids':  None if args.model_type == 'xlm' else batch[2],  # XLM don't use segment_ids
+                      'start_positions': batch[3],
+                      'end_positions':   batch[4]}
+            if args.model_type in ['xlnet', 'xlm']:
+                inputs.update({'cls_index': batch[5],
+                               'p_mask':    batch[6]})
+            outputs = model(**inputs)
+            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                # Update the weights first, then the learning rate schedule (order expected by PyTorch >= 1.1)
+                optimizer.step()
+                scheduler.step()
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            # Stop as soon as exactly `max_steps` updates have been performed
+            if args.max_steps > 0 and global_step >= args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step >= args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    # Avoid a ZeroDivisionError when no optimization step could be performed
+    return global_step, tr_loss / max(global_step, 1)
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    """ Evaluate `model` on `args.predict_file` and score the predictions with the official SQuAD script.
+
+    Args:
+        args: parsed command-line arguments; `args.eval_batch_size` is set here.
+        model: model to evaluate; it is put in eval mode and called without gradient tracking.
+        tokenizer: used to build the evaluation features and passed to the extended (XLNet/XLM) post-processing.
+        prefix: string inserted in the log line and in the names of the prediction files
+            created under `args.output_dir`.
+
+    Returns:
+        The value returned by `evaluate_on_squad` for the written prediction file.
+    """
+    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
+
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(args.output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Always iterate over the whole evaluation set, in order: a DistributedSampler samples randomly and
+    # only yields this process' shard, while the full `features` list is post-processed below.
+    eval_sampler = SequentialSampler(dataset)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Eval!
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = %d", len(dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    all_results = []
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        model.eval()
+        batch = tuple(t.to(args.device) for t in batch)
+        with torch.no_grad():
+            # The dataset stores the attention (input) mask at index 1 and the segment ids at index 2
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1],
+                      'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+                      }
+            example_indices = batch[3]
+            if args.model_type in ['xlnet', 'xlm']:
+                inputs.update({'cls_index': batch[4],
+                               'p_mask':    batch[5]})
+            outputs = model(**inputs)
+
+        # Pair every row of the batch outputs with the unique id of the feature it was computed from
+        for i, example_index in enumerate(example_indices):
+            eval_feature = features[example_index.item()]
+            unique_id = int(eval_feature.unique_id)
+            if args.model_type in ['xlnet', 'xlm']:
+                # XLNet uses a more complex post-processing procedure
+                result = RawResultExtended(unique_id            = unique_id,
+                                           start_top_log_probs  = to_list(outputs[0][i]),
+                                           start_top_index      = to_list(outputs[1][i]),
+                                           end_top_log_probs    = to_list(outputs[2][i]),
+                                           end_top_index        = to_list(outputs[3][i]),
+                                           cls_logits           = to_list(outputs[4][i]))
+            else:
+                result = RawResult(unique_id    = unique_id,
+                                   start_logits = to_list(outputs[0][i]),
+                                   end_logits   = to_list(outputs[1][i]))
+            all_results.append(result)
+
+    # Compute predictions
+    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
+    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
+    output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
+
+    if args.model_type in ['xlnet', 'xlm']:
+        # XLNet uses a more complex post-processing procedure
+        write_predictions_extended(examples, features, all_results, args.n_best_size,
+                        args.max_answer_length, output_prediction_file,
+                        output_nbest_file, output_null_log_odds_file, args.predict_file,
+                        model.config.start_n_top, model.config.end_n_top,
+                        args.version_2_with_negative, tokenizer, args.verbose_logging)
+    else:
+        write_predictions(examples, features, all_results, args.n_best_size,
+                        args.max_answer_length, args.do_lower_case, output_prediction_file,
+                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
+                        args.version_2_with_negative, args.null_score_diff_threshold)
+
+    # Evaluate with the official SQuAD script
+    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
+                                 pred_file=output_prediction_file,
+                                 na_prob_file=output_null_log_odds_file)
+    results = evaluate_on_squad(evaluate_options)
+    return results
+
+
+def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+    """ Build the TensorDataset for the train or predict file, going through the feature cache.
+
+    Features are loaded from a cache file located next to the input file, unless that file is missing,
+    `args.overwrite_cache` is set or `output_examples` is requested. In those cases they are rebuilt
+    from the SQuAD json and saved again when `args.local_rank` is -1 or 0.
+
+    With `evaluate=True` the dataset rows are (input_ids, input_mask, segment_ids, example_index,
+    cls_index, p_mask); otherwise they are (input_ids, input_mask, segment_ids, start_positions,
+    end_positions, cls_index, p_mask).
+
+    Returns the dataset alone, or `(dataset, examples, features)` when `output_examples` is true.
+    """
+    source_path = args.predict_file if evaluate else args.train_file
+    split_tag = 'dev' if evaluate else 'train'
+    model_tag = list(filter(None, args.model_name_or_path.split('/'))).pop()
+    cache_name = 'cached_{}_{}_{}'.format(split_tag, model_tag, str(args.max_seq_length))
+    cached_features_file = os.path.join(os.path.dirname(source_path), cache_name)
+
+    cache_usable = os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples
+    if not cache_usable:
+        logger.info("Creating features from dataset file at %s", source_path)
+        examples = read_squad_examples(input_file=source_path,
+                                       is_training=not evaluate,
+                                       version_2_with_negative=args.version_2_with_negative)
+        features = convert_examples_to_features(examples=examples,
+                                                tokenizer=tokenizer,
+                                                max_seq_length=args.max_seq_length,
+                                                doc_stride=args.doc_stride,
+                                                max_query_length=args.max_query_length,
+                                                is_training=not evaluate)
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+    else:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+
+    def column(field, dtype=torch.long):
+        # One tensor gathering `field` from every feature, in feature order
+        return torch.tensor([getattr(f, field) for f in features], dtype=dtype)
+
+    # Columns shared by both layouts; the mode-specific ones are inserted between them
+    leading = (column('input_ids'), column('input_mask'), column('segment_ids'))
+    trailing = (column('cls_index'), column('p_mask', dtype=torch.float))
+    if evaluate:
+        middle = (torch.arange(leading[0].size(0), dtype=torch.long),)
+    else:
+        middle = (column('start_position'), column('end_position'))
+    dataset = TensorDataset(*(leading + middle + trailing))
+
+    if not output_examples:
+        return dataset
+    return dataset, examples, features
+

 def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
-    parser.add_argument("--bert_model", default=None, type=str, required=True,
-                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
-                        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
-                        "bert-base-multilingual-cased, bert-base-chinese.")
+    parser.add_argument("--train_file", default=None, type=str, required=True,
+                        help="SQuAD json for training. E.g., train-v1.1.json")
+    parser.add_argument("--predict_file", default=None, type=str, required=True,
+                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model checkpoints and predictions will be written.")

    ## Other parameters
-    parser.add_argument("--train_file", default=None, type=str, help="SQuAD json for training. E.g., train-v1.1.json")
-    parser.add_argument("--predict_file", default=None, type=str,
-                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+
+    parser.add_argument('--version_2_with_negative', action='store_true',
+                        help='If true, the SQuAD examples contain some that do not have an answer.')
+    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
+                        help="If null_score - best_non_null is greater than the threshold predict null.")
+
    parser.add_argument("--max_seq_length", default=384, type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
                             "longer than this will be truncated, and sequences shorter than this will be padded.")
@@ -71,65 +340,74 @@ def main():
    parser.add_argument("--max_query_length", default=64, type=int,
                        help="The maximum number of tokens for the question. Questions longer than this will "
                             "be truncated to this length.")
-    parser.add_argument("--do_train", action='store_true', help="Whether to run training.")
-    parser.add_argument("--do_predict", action='store_true', help="Whether to run eval on the dev set.")
-    parser.add_argument("--train_batch_size", default=32, type=int, help="Total batch size for training.")
-    parser.add_argument("--predict_batch_size", default=8, type=int, help="Total batch size for predictions.")
-    parser.add_argument("--learning_rate", default=5e-5, type=float, help="The initial learning rate for Adam.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Rul evaluation during training at each logging step.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
    parser.add_argument("--num_train_epochs", default=3.0, type=float,
                        help="Total number of training epochs to perform.")
-    parser.add_argument("--warmup_proportion", default=0.1, type=float,
-                        help="Proportion of training to perform linear learning rate warmup for. E.g., 0.1 = 10%% "
-                             "of training.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--n_best_size", default=20, type=int,
-                        help="The total number of n-best predictions to generate in the nbest_predictions.json "
-                             "output file.")
+                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
    parser.add_argument("--max_answer_length", default=30, type=int,
                        help="The maximum length of an answer that can be generated. This is needed because the start "
                             "and end predictions are not conditioned on one another.")
    parser.add_argument("--verbose_logging", action='store_true',
                        help="If true, all of the warnings related to data processing will be printed. "
                             "A number of warnings are expected for a normal SQuAD evaluation.")
-    parser.add_argument("--no_cuda",
-                        action='store_true',
+
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    parser.add_argument("--no_cuda", action='store_true',
                        help="Whether not to use CUDA when available")
-    parser.add_argument('--seed',
-                        type=int,
-                        default=42,
-                        help="random seed for initialization")
-    parser.add_argument('--gradient_accumulation_steps',
-                        type=int,
-                        default=1,
-                        help="Number of updates steps to accumulate before performing a backward/update pass.")
-    parser.add_argument("--do_lower_case",
-                        action='store_true',
-                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
-    parser.add_argument("--local_rank",
-                        type=int,
-                        default=-1,
-                        help="local_rank for distributed training on gpus")
-    parser.add_argument('--fp16',
-                        action='store_true',
-                        help="Whether to use 16-bit float precision instead of 32-bit")
-    parser.add_argument('--overwrite_output_dir',
-                        action='store_true',
+    parser.add_argument('--overwrite_output_dir', action='store_true',
                        help="Overwrite the content of the output directory")
-    parser.add_argument('--loss_scale',
-                        type=float, default=0,
-                        help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
-                             "0 (default value): dynamic loss scaling.\n"
-                             "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument('--version_2_with_negative',
-                        action='store_true',
-                        help='If true, the SQuAD examples contain some that do not have an answer.')
-    parser.add_argument('--null_score_diff_threshold',
-                        type=float, default=0.0,
-                        help="If null_score - best_non_null is greater than the threshold predict null.")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
    args = parser.parse_args()
-    print(args)

+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
+    # Setup distant debugging if needed
    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
@@ -137,263 +415,105 @@ def main():
        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
        ptvsd.wait_for_attach()

+    # Setup CUDA, GPU & distributed training
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
-        n_gpu = torch.cuda.device_count()
-    else:
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
-        n_gpu = 1
-        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device

+    # Setup logging
    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                        datefmt = '%m/%d/%Y %H:%M:%S',
                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)

-    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
-        device, n_gpu, bool(args.local_rank != -1), args.fp16))
-
-    if args.gradient_accumulation_steps < 1:
-        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
-                            args.gradient_accumulation_steps))
-
-    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps
-
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-    if not args.do_train and not args.do_predict:
-        raise ValueError("At least one of `do_train` or `do_predict` must be True.")
-
-    if args.do_train:
-        if not args.train_file:
-            raise ValueError(
-                "If `do_train` is True, then `train_file` must be specified.")
-    if args.do_predict:
-        if not args.predict_file:
-            raise ValueError(
-                "If `do_predict` is True, then `predict_file` must be specified.")
-
-    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
-        raise ValueError("Output directory () already exists and is not empty.")
-    if not os.path.exists(args.output_dir):
-        os.makedirs(args.output_dir)
+    # Set seed
+    set_seed(args)

+    # Load pretrained model and tokenizer
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
-    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
-    model = BertForQuestionAnswering.from_pretrained(args.bert_model)
-    if args.local_rank == 0:
-        torch.distributed.barrier()

-    if args.fp16:
-        model.half()
-    model.to(device)
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    # Distributed and parallel training
+    model.to(args.device)
    if args.local_rank != -1:
-        model = torch.nn.parallel.DistributedDataParallel(model,
-                                                          device_ids=[args.local_rank],
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)
-    elif n_gpu > 1:
+    elif args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
    if args.do_train:
-        if args.local_rank in [-1, 0]:
-            tb_writer = SummaryWriter()
-        # Prepare data loader
-        train_examples = read_squad_examples(
-            input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
-        cached_train_features_file = args.train_file+'_{0}_{1}_{2}_{3}'.format(
-            list(filter(None, args.bert_model.split('/'))).pop(), str(args.max_seq_length), str(args.doc_stride), str(args.max_query_length))
-        try:
-            with open(cached_train_features_file, "rb") as reader:
-                train_features = pickle.load(reader)
-        except:
-            train_features = convert_examples_to_features(
-                examples=train_examples,
-                tokenizer=tokenizer,
-                max_seq_length=args.max_seq_length,
-                doc_stride=args.doc_stride,
-                max_query_length=args.max_query_length,
-                is_training=True)
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+
+    # Save the trained model and the tokenizer
    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
-                logger.info("  Saving train features into cached file %s", cached_train_features_file)
-                with open(cached_train_features_file, "wb") as writer:
-                    pickle.dump(train_features, writer)
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)

-        all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in train_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in train_features], dtype=torch.long)
-        all_start_positions = torch.tensor([f.start_position for f in train_features], dtype=torch.long)
-        all_end_positions = torch.tensor([f.end_position for f in train_features], dtype=torch.long)
-        train_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                   all_start_positions, all_end_positions)
-        if args.local_rank == -1:
-            train_sampler = RandomSampler(train_data)
-        else:
-            train_sampler = DistributedSampler(train_data)
-
-        train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=args.train_batch_size)
-        num_train_optimization_steps = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
-        # if args.local_rank != -1:
-        #     num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()
-
-        # Prepare optimizer
-        param_optimizer = list(model.named_parameters())
-
-        # hack to remove pooler, which is not used
-        # thus it produce None grad that break apex
-        param_optimizer = [n for n in param_optimizer if 'pooler' not in n[0]]
-
-        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
-        optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
-            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
-            ]
-
-        if args.fp16:
-            try:
-                from apex.optimizers import FP16_Optimizer
-                from apex.optimizers import FusedAdam
-            except ImportError:
-                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
-
-            optimizer = FusedAdam(optimizer_grouped_parameters,
-                                  lr=args.learning_rate,
-                                  bias_correction=False,
-                                  max_grad_norm=1.0)
-            if args.loss_scale == 0:
-                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
-            else:
-                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)
-            warmup_linear = WarmupLinearSchedule(warmup=args.warmup_proportion,
-                                                 t_total=num_train_optimization_steps)
-        else:
-            optimizer = BertAdam(optimizer_grouped_parameters,
-                                 lr=args.learning_rate,
-                                 warmup=args.warmup_proportion,
-                                 t_total=num_train_optimization_steps)
-
-        global_step = 0
-
-        logger.info("***** Running training *****")
-        logger.info("  Num orig examples = %d", len(train_examples))
-        logger.info("  Num split examples = %d", len(train_features))
-        logger.info("  Batch size = %d", args.train_batch_size)
-        logger.info("  Num steps = %d", num_train_optimization_steps)
-
-        model.train()
-        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
-            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])):
-                if n_gpu == 1:
-                    batch = tuple(t.to(device) for t in batch) # multi-gpu does scattering it-self
-                input_ids, input_mask, segment_ids, start_positions, end_positions = batch
-                loss = model(input_ids, segment_ids, input_mask, start_positions, end_positions)
-                if n_gpu > 1:
-                    loss = loss.mean() # mean() to average on multi-gpu.
-                if args.gradient_accumulation_steps > 1:
-                    loss = loss / args.gradient_accumulation_steps
-
-                if args.fp16:
-                    optimizer.backward(loss)
-                else:
-                    loss.backward()
-                if (step + 1) % args.gradient_accumulation_steps == 0:
-                    if args.fp16:
-                        # modify learning rate with special warm up BERT uses
-                        # if args.fp16 is False, BertAdam is used and handles this automatically
-                        lr_this_step = args.learning_rate * warmup_linear.get_lr(global_step, args.warmup_proportion)
-                        for param_group in optimizer.param_groups:
-                            param_group['lr'] = lr_this_step
-                    optimizer.step()
-                    optimizer.zero_grad()
-                    global_step += 1
-                    if args.local_rank in [-1, 0]:
-                        tb_writer.add_scalar('lr', optimizer.get_lr()[0], global_step)
-                        tb_writer.add_scalar('loss', loss.item(), global_step)
-
-    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        # Save a trained model, configuration and tokenizer
-        model_to_save = model.module if hasattr(model, 'module') else model  # Only save the model it-self
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_file = os.path.join(args.output_dir, WEIGHTS_NAME)
-        output_config_file = os.path.join(args.output_dir, CONFIG_NAME)
-
-        torch.save(model_to_save.state_dict(), output_model_file)
-        model_to_save.config.to_json_file(output_config_file)
-        tokenizer.save_vocabulary(args.output_dir)
-
-        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)

        # Good practice: save your training arguments together with the trained model
-        output_args_file = os.path.join(args.output_dir, 'training_args.bin')
-        torch.save(args, output_args_file)
-    else:
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+
        # Load a trained model and vocabulary that you have fine-tuned
-        model = BertForQuestionAnswering.from_pretrained(args.output_dir)
-        tokenizer = BertTokenizer.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        model.to(args.device)

-    model.to(device)

-    if args.do_predict and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
-        eval_examples = read_squad_examples(
-            input_file=args.predict_file, is_training=False, version_2_with_negative=args.version_2_with_negative)
-        eval_features = convert_examples_to_features(
-            examples=eval_examples,
-            tokenizer=tokenizer,
-            max_seq_length=args.max_seq_length,
-            doc_stride=args.doc_stride,
-            max_query_length=args.max_query_length,
-            is_training=False)
+    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs

-        logger.info("***** Running predictions *****")
-        logger.info("  Num orig examples = %d", len(eval_examples))
-        logger.info("  Num split examples = %d", len(eval_features))
-        logger.info("  Batch size = %d", args.predict_batch_size)
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)

-        all_input_ids = torch.tensor([f.input_ids for f in eval_features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.input_mask for f in eval_features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features], dtype=torch.long)
-        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-        eval_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_example_index)
-        # Run prediction for full data
-        eval_sampler = SequentialSampler(eval_data)
-        eval_dataloader = DataLoader(eval_data, sampler=eval_sampler, batch_size=args.predict_batch_size)
+        for checkpoint in checkpoints:
+            # Reload the model
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)

-        model.eval()
-        all_results = []
-        logger.info("Start evaluating")
-        for input_ids, input_mask, segment_ids, example_indices in tqdm(eval_dataloader, desc="Evaluating", disable=args.local_rank not in [-1, 0]):
-            if len(all_results) % 1000 == 0:
-                logger.info("Processing example: %d" % (len(all_results)))
-            input_ids = input_ids.to(device)
-            input_mask = input_mask.to(device)
-            segment_ids = segment_ids.to(device)
-            with torch.no_grad():
-                batch_start_logits, batch_end_logits = model(input_ids, segment_ids, input_mask)
-            for i, example_index in enumerate(example_indices):
-                start_logits = batch_start_logits[i].detach().cpu().tolist()
-                end_logits = batch_end_logits[i].detach().cpu().tolist()
-                eval_feature = eval_features[example_index.item()]
-                unique_id = int(eval_feature.unique_id)
-                all_results.append(RawResult(unique_id=unique_id,
-                                             start_logits=start_logits,
-                                             end_logits=end_logits))
-        output_prediction_file = os.path.join(args.output_dir, "predictions.json")
-        output_nbest_file = os.path.join(args.output_dir, "nbest_predictions.json")
-        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds.json")
-        write_predictions(eval_examples, eval_features, all_results,
-                          args.n_best_size, args.max_answer_length,
-                          args.do_lower_case, output_prediction_file,
-                          output_nbest_file, output_null_log_odds_file, args.verbose_logging,
-                          args.version_2_with_negative, args.null_score_diff_threshold)
+            # Evaluate
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+
+            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            results.update(result)
+
+    logger.info("Results: {}".format(results))
+
+    return results


 if __name__ == "__main__":
--- a/examples/single_model_scripts/run_openai_gpt.py
+++ b/examples/single_model_scripts/run_openai_gpt.py
@@ -39,8 +39,8 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)

-from pytorch_pretrained_bert import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
-                                     OpenAIAdam, cached_path, WEIGHTS_NAME, CONFIG_NAME)
+from pytorch_transformers import (OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer,
+                                     AdamW, cached_path, WEIGHTS_NAME, CONFIG_NAME)

 ROCSTORIES_URL = "https://s3.amazonaws.com/datasets.huggingface.co/ROCStories.tar.gz"

@@ -191,7 +191,7 @@ def main():
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]
        num_train_optimization_steps = len(train_dataloader) * args.num_train_epochs
-        optimizer = OpenAIAdam(optimizer_grouped_parameters,
+        optimizer = AdamW(optimizer_grouped_parameters,
                               lr=args.learning_rate,
                               warmup=args.warmup_proportion,
                               max_grad_norm=args.max_grad_norm,
--- a/examples/single_model_scripts/run_swag.py
+++ b/examples/single_model_scripts/run_swag.py
@@ -32,10 +32,10 @@ from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
 from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange

-from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
-from pytorch_pretrained_bert.modeling import BertForMultipleChoice, BertConfig
-from pytorch_pretrained_bert.optimization import BertAdam, WarmupLinearSchedule
-from pytorch_pretrained_bert.tokenization import BertTokenizer
+from pytorch_transformers.file_utils import PYTORCH_PRETRAINED_BERT_CACHE, WEIGHTS_NAME, CONFIG_NAME
+from pytorch_transformers.modeling_bert import BertForMultipleChoice, BertConfig
+from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule
+from pytorch_transformers.tokenization_bert import BertTokenizer

 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
--- a/examples/single_model_scripts/run_transfo_xl.py
+++ b/examples/single_model_scripts/run_transfo_xl.py
@@ -28,7 +28,7 @@ import math

 import torch

-from pytorch_pretrained_bert import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer
+from pytorch_transformers import TransfoXLLMHeadModel, TransfoXLCorpus, TransfoXLTokenizer

 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
--- a/examples/test_examples.py
+++ b/examples/test_examples.py
@@ -0,0 +1,130 @@
+# coding=utf-8
+# Copyright 2018 HuggingFace Inc..
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Smoke tests that call the example scripts' ``main()`` entry points with test arguments."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import sys
+import unittest
+import argparse
+import logging
+
+try:
+    # python 3.3+ can use builtin unittest.mock instead of mock package
+    from unittest.mock import patch
+except ImportError:
+    from mock import patch
+
+import run_glue
+import run_squad
+import run_generation
+
+logging.basicConfig(level=logging.DEBUG)
+
+logger = logging.getLogger()
+
+
+def get_setup_file():
+    """Return the value of the ``-f`` command-line option, or ``None`` when it is not given."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument('-f')
+    args = parser.parse_args()
+    return args.f
+
+
+class ExamplesTests(unittest.TestCase):
+    """Call each example script's ``main()`` under a patched ``sys.argv`` and check what it returns."""
+
+    def _log_to_stdout(self):
+        """Mirror root-logger records to stdout until the current test finishes.
+
+        The handler is detached again on cleanup. Without that, each test would
+        leave one more handler on the shared root logger and later tests would
+        print every record several times.
+        """
+        stream_handler = logging.StreamHandler(sys.stdout)
+        logger.addHandler(stream_handler)
+        self.addCleanup(logger.removeHandler, stream_handler)
+
+    def test_run_glue(self):
+        """Train and evaluate BERT on the MRPC sample; every returned metric must be >= 0.75."""
+        self._log_to_stdout()
+
+        testargs = ["run_glue.py",
+                    "--data_dir=./examples/tests_samples/MRPC/",
+                    "--task_name=mrpc",
+                    "--do_train",
+                    "--do_eval",
+                    "--output_dir=./examples/tests_samples/temp_dir",
+                    "--per_gpu_train_batch_size=2",
+                    "--per_gpu_eval_batch_size=1",
+                    "--learning_rate=1e-4",
+                    "--max_steps=10",
+                    "--warmup_steps=2",
+                    "--overwrite_output_dir",
+                    "--seed=42"]
+        model_type, model_name = ("--model_type=bert",
+                                  "--model_name_or_path=bert-base-uncased")
+        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+            result = run_glue.main()
+            for value in result.values():
+                self.assertGreaterEqual(value, 0.75)
+
+    def test_run_squad(self):
+        """Train and evaluate BERT on the small SQuAD v2.0 sample; ``f1`` and ``exact`` must be >= 30."""
+        self._log_to_stdout()
+
+        testargs = ["run_squad.py",
+                    "--train_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
+                    "--predict_file=./examples/tests_samples/SQUAD/dev-v2.0-small.json",
+                    # NOTE(review): "--model_name" appears to rely on argparse prefix matching for
+                    # "--model_name_or_path", which is also passed explicitly below - confirm and drop.
+                    "--model_name=bert-base-uncased",
+                    "--output_dir=./examples/tests_samples/temp_dir",
+                    "--max_steps=10",
+                    "--warmup_steps=2",
+                    "--do_train",
+                    "--do_eval",
+                    "--version_2_with_negative",
+                    "--learning_rate=1e-4",
+                    "--per_gpu_train_batch_size=2",
+                    "--per_gpu_eval_batch_size=1",
+                    "--overwrite_output_dir",
+                    "--seed=42"]
+        model_type, model_name = ("--model_type=bert",
+                                  "--model_name_or_path=bert-base-uncased")
+        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+            result = run_squad.main()
+            self.assertGreaterEqual(result['f1'], 30)
+            self.assertGreaterEqual(result['exact'], 30)
+
+    def test_generation(self):
+        """Generate from the prompt "Hello" with openai-gpt; the result must have length >= 10."""
+        self._log_to_stdout()
+
+        testargs = ["run_generation.py",
+                    "--prompt=Hello",
+                    "--length=10",
+                    "--seed=42"]
+        model_type, model_name = ("--model_type=openai-gpt",
+                                  "--model_name_or_path=openai-gpt")
+        with patch.object(sys, 'argv', testargs + [model_type, model_name]):
+            result = run_generation.main()
+            self.assertGreaterEqual(len(result), 10)
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/examples/tests_samples/.gitignore
+++ b/examples/tests_samples/.gitignore
@@ -0,0 +1,6 @@
+*.*
+cache*
+temp*
+!*.tsv
+!*.json
+!.gitignore
--- a/examples/tests_samples/MRPC/dev.tsv
+++ b/examples/tests_samples/MRPC/dev.tsv
@@ -0,0 +1,7 @@
+Quality	#1 ID	#2 ID	#1 String	#2 String
+1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
+0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
+0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
+1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
+1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
--- a/examples/tests_samples/MRPC/train.tsv
+++ b/examples/tests_samples/MRPC/train.tsv
@@ -0,0 +1,7 @@
+Quality	#1 ID	#2 ID	#1 String	#2 String
+1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
+0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
+0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
+1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
+0	1783137	1782659	No dates have been set for the civil or the criminal trial .	No dates have been set for the criminal or civil cases , but Shanley has pleaded not guilty .
+1	3039165	3039036	Wal-Mart said it would check all of its million-plus domestic workers to ensure they were legally employed .	It has also said it would review all of its domestic employees more than 1 million to ensure they have legal status .
--- a/examples/tests_samples/SQUAD/dev-v2.0-small.json
+++ b/examples/tests_samples/SQUAD/dev-v2.0-small.json
@@ -0,0 +1,140 @@
+{
+    "version": "v2.0",
+    "data": [{
+        "title": "Normans",
+        "paragraphs": [{
+            "qas": [{
+                "question": "In what country is Normandy located?",
+                "id": "56ddde6b9a695914005b9628",
+                "answers": [{
+                    "text": "France",
+                    "answer_start": 159
+                }],
+                "is_impossible": false
+            }, {
+                "question": "When were the Normans in Normandy?",
+                "id": "56ddde6b9a695914005b9629",
+                "answers": [{
+                    "text": "10th and 11th centuries",
+                    "answer_start": 94
+                }],
+                "is_impossible": false
+            }, {
+                "question": "From which countries did the Norse originate?",
+                "id": "56ddde6b9a695914005b962a",
+                "answers": [{
+                    "text": "Denmark, Iceland and Norway",
+                    "answer_start": 256
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "Rollo",
+                    "answer_start": 308
+                }],
+                "question": "Who did King Charles III swear fealty to?",
+                "id": "5ad39d53604f3c001a3fe8d3",
+                "answers": [],
+                "is_impossible": true
+            }, {
+                "plausible_answers": [{
+                    "text": "10th century",
+                    "answer_start": 671
+                }],
+                "question": "When did the Frankish identity emerge?",
+                "id": "5ad39d53604f3c001a3fe8d4",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "The Normans (Norman: Nourmands; French: Normands; Latin: Normanni) were the people who in the 10th and 11th centuries gave their name to Normandy, a region in France. They were descended from Norse (\"Norman\" comes from \"Norseman\") raiders and pirates from Denmark, Iceland and Norway who, under their leader Rollo, agreed to swear fealty to King Charles III of West Francia. Through generations of assimilation and mixing with the native Frankish and Roman-Gaulish populations, their descendants would gradually merge with the Carolingian-based cultures of West Francia. The distinct cultural and ethnic identity of the Normans emerged initially in the first half of the 10th century, and it continued to evolve over the succeeding centuries."
+        }, {
+            "qas": [{
+                "question": "Who was the duke in the battle of Hastings?",
+                "id": "56dddf4066d3e219004dad5f",
+                "answers": [{
+                    "text": "William the Conqueror",
+                    "answer_start": 1022
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "Antioch",
+                    "answer_start": 1295
+                }],
+                "question": "What principality did William the conquerer found?",
+                "id": "5ad3a266604f3c001a3fea2b",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "The Norman dynasty had a major political, cultural and military impact on medieval Europe and even the Near East. The Normans were famed for their martial spirit and eventually for their Christian piety, becoming exponents of the Catholic orthodoxy into which they assimilated. They adopted the Gallo-Romance language of the Frankish land they settled, their dialect becoming known as Norman, Normaund or Norman French, an important literary language. The Duchy of Normandy, which they formed by treaty with the French crown, was a great fief of medieval France, and under Richard I of Normandy was forged into a cohesive and formidable principality in feudal tenure. The Normans are noted both for their culture, such as their unique Romanesque architecture and musical traditions, and for their significant military accomplishments and innovations. Norman adventurers founded the Kingdom of Sicily under Roger II after conquering southern Italy on the Saracens and Byzantines, and an expedition on behalf of their duke, William the Conqueror, led to the Norman conquest of England at the Battle of Hastings in 1066. Norman cultural and military influence spread from these new European centres to the Crusader states of the Near East, where their prince Bohemond I founded the Principality of Antioch in the Levant, to Scotland and Wales in Great Britain, to Ireland, and to the coasts of north Africa and the Canary Islands."
+        }]
+    }, {
+        "title": "Computational_complexity_theory",
+        "paragraphs": [{
+            "qas": [{
+                "question": "What branch of theoretical computer science deals with broadly classifying computational problems by difficulty and class of relationship?",
+                "id": "56e16182e3433e1400422e28",
+                "answers": [{
+                    "text": "Computational complexity theory",
+                    "answer_start": 0
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "algorithm",
+                    "answer_start": 472
+                }],
+                "question": "What is a manual application of mathematical steps?",
+                "id": "5ad5316b5b96ef001a10ab76",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
+        }, {
+            "qas": [{
+                "question": "What measure of a computational problem broadly defines the inherent difficulty of the solution?",
+                "id": "56e16839cd28a01900c67887",
+                "answers": [{
+                    "text": "if its solution requires significant resources",
+                    "answer_start": 46
+                }],
+                "is_impossible": false
+            }, {
+                "question": "What method is used to intuitively assess or quantify the amount of resources required to solve a computational problem?",
+                "id": "56e16839cd28a01900c67888",
+                "answers": [{
+                    "text": "mathematical models of computation",
+                    "answer_start": 176
+                }],
+                "is_impossible": false
+            }, {
+                "question": "What are two basic primary resources used to guage complexity?",
+                "id": "56e16839cd28a01900c67889",
+                "answers": [{
+                    "text": "time and storage",
+                    "answer_start": 305
+                }],
+                "is_impossible": false
+            }, {
+                "plausible_answers": [{
+                    "text": "the number of gates in a circuit",
+                    "answer_start": 436
+                }],
+                "question": "What unit is measured to determine circuit simplicity?",
+                "id": "5ad532575b96ef001a10ab7f",
+                "answers": [],
+                "is_impossible": true
+            }, {
+                "plausible_answers": [{
+                    "text": "the number of processors",
+                    "answer_start": 502
+                }],
+                "question": "What number is used in perpendicular computing?",
+                "id": "5ad532575b96ef001a10ab80",
+                "answers": [],
+                "is_impossible": true
+            }],
+            "context": "A problem is regarded as inherently difficult if its solution requires significant resources, whatever the algorithm used. The theory formalizes this intuition, by introducing mathematical models of computation to study these problems and quantifying the amount of resources needed to solve them, such as time and storage. Other complexity measures are also used, such as the amount of communication (used in communication complexity), the number of gates in a circuit (used in circuit complexity) and the number of processors (used in parallel computing). One of the roles of computational complexity theory is to determine the practical limits on what computers can and cannot do."
+        }]
+    }]
+}
--- a/examples/run_classifier_dataset_utils.py
+++ b/examples/run_classifier_dataset_utils.py
@@ -21,6 +21,7 @@ import csv
 import logging
 import os
 import sys
+from io import open

 from scipy.stats import pearsonr, spearmanr
 from sklearn.metrics import matthews_corrcoef, f1_score
@@ -77,7 +78,7 @@ class DataProcessor(object):
    @classmethod
    def _read_tsv(cls, input_file, quotechar=None):
        """Reads a tab separated value file."""
-        with open(input_file, "r", encoding="utf-8") as f:
+        with open(input_file, "r", encoding="utf-8-sig") as f:
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            lines = []
            for line in reader:
@@ -388,8 +389,18 @@ class WnliProcessor(DataProcessor):


 def convert_examples_to_features(examples, label_list, max_seq_length,
-                                 tokenizer, output_mode):
-    """Loads a data file into a list of `InputBatch`s."""
+                                 tokenizer, output_mode,
+                                 cls_token_at_end=False, pad_on_left=False,
+                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
+                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
+                                 cls_token_segment_id=0, pad_token_segment_id=0,
+                                 mask_padding_with_zero=True):
+    """ Loads a data file into a list of `InputBatch`s
+        `cls_token_at_end` defines the location of the CLS token:
+            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
+            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
+        `cls_token_segment_id` defines the segment id associated to the CLS token (default 0 as for BERT, use 2 for XLNet)
+    """

    label_map = {label : i for i, label in enumerate(label_list)}

@@ -430,24 +441,36 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
        # For classification tasks, the first vector (corresponding to [CLS]) is
        # used as as the "sentence vector". Note that this only makes sense because
        # the entire model is fine-tuned.
-        tokens = ["[CLS]"] + tokens_a + ["[SEP]"]
-        segment_ids = [0] * len(tokens)
+        tokens = tokens_a + [sep_token]
+        segment_ids = [sequence_a_segment_id] * len(tokens)

        if tokens_b:
-            tokens += tokens_b + ["[SEP]"]
-            segment_ids += [1] * (len(tokens_b) + 1)
+            tokens += tokens_b + [sep_token]
+            segment_ids += [sequence_b_segment_id] * (len(tokens_b) + 1)
+
+        if cls_token_at_end:
+            tokens = tokens + [cls_token]
+            segment_ids = segment_ids + [cls_token_segment_id]
+        else:
+            tokens = [cls_token] + tokens
+            segment_ids = [cls_token_segment_id] + segment_ids

        input_ids = tokenizer.convert_tokens_to_ids(tokens)

        # The mask has 1 for real tokens and 0 for padding tokens. Only real
        # tokens are attended to.
-        input_mask = [1] * len(input_ids)
+        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

        # Zero-pad up to the sequence length.
-        padding = [0] * (max_seq_length - len(input_ids))
-        input_ids += padding
-        input_mask += padding
-        segment_ids += padding
+        padding_length = max_seq_length - len(input_ids)
+        if pad_on_left:
+            input_ids = ([pad_token] * padding_length) + input_ids
+            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
+            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
+        else:
+            input_ids = input_ids + ([pad_token] * padding_length)
+            input_mask = input_mask + ([0 if mask_padding_with_zero else 1] * padding_length)
+            segment_ids = segment_ids + ([pad_token_segment_id] * padding_length)

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
@@ -467,8 +490,7 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
                    [str(x) for x in tokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
-            logger.info(
-                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+            logger.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
            logger.info("label: %s (id = %d)" % (example.label, label_id))

        features.append(
@@ -561,6 +583,7 @@ processors = {
 output_modes = {
    "cola": "classification",
    "mnli": "classification",
+    "mnli-mm": "classification",
    "mrpc": "classification",
    "sst-2": "classification",
    "sts-b": "regression",
@@ -569,3 +592,17 @@ output_modes = {
    "rte": "classification",
    "wnli": "classification",
 }
+
+# Number of labels per GLUE task (1 for the regression task sts-b); keys mirror `output_modes`.
+GLUE_TASKS_NUM_LABELS = {
+    "cola": 2,
+    "mnli": 3,
+    "mnli-mm": 3,
+    "mrpc": 2,
+    "sst-2": 2,
+    "sts-b": 1,
+    "qqp": 2,
+    "qnli": 2,
+    "rte": 2,
+    "wnli": 2,
+}
--- a/examples/run_squad_dataset_utils.py
+++ b/examples/run_squad_dataset_utils.py
@@ -1,3 +1,4 @@
+
 # coding=utf-8
 # Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
 # Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
@@ -23,7 +24,10 @@ import math
 import collections
 from io import open

-from pytorch_pretrained_bert.tokenization import BasicTokenizer, whitespace_tokenize
+from pytorch_transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
+
+# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
+from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores

 logger = logging.getLogger(__name__)

@@ -81,6 +85,9 @@ class InputFeatures(object):
                 input_ids,
                 input_mask,
                 segment_ids,
+                 cls_index,
+                 p_mask,
+                 paragraph_len,
                 start_position=None,
                 end_position=None,
                 is_impossible=None):
@@ -93,6 +100,9 @@ class InputFeatures(object):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
+        self.cls_index = cls_index
+        self.p_mask = p_mask
+        self.paragraph_len = paragraph_len
        self.start_position = start_position
        self.end_position = end_position
        self.is_impossible = is_impossible
@@ -177,13 +187,25 @@ def read_squad_examples(input_file, is_training, version_2_with_negative):


 def convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                 doc_stride, max_query_length, is_training):
+                                 doc_stride, max_query_length, is_training,
+                                 cls_token_at_end=False,
+                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
+                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
+                                 cls_token_segment_id=0, pad_token_segment_id=0,
+                                 mask_padding_with_zero=True):
    """Loads a data file into a list of `InputBatch`s."""

    unique_id = 1000000000
+    # cnt_pos, cnt_neg = 0, 0
+    # max_N, max_M = 1024, 1024
+    # f = np.zeros((max_N, max_M), dtype=np.float32)

    features = []
    for (example_index, example) in enumerate(examples):
+
+        # if example_index % 100 == 0:
+        #     logger.info('Converting %s/%s pos %s neg %s', example_index, len(examples), cnt_pos, cnt_neg)
+
        query_tokens = tokenizer.tokenize(example.question_text)

        if len(query_tokens) > max_query_length:
@@ -238,14 +260,30 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []
-            tokens.append("[CLS]")
-            segment_ids.append(0)
+
+            # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
+            # Original TF implementation also keeps the classification token (set to 0) (not sure why...)
+            p_mask = []
+
+            # CLS token at the beginning
+            if not cls_token_at_end:
+                tokens.append(cls_token)
+                segment_ids.append(cls_token_segment_id)
+                p_mask.append(0)
+                cls_index = 0
+
+            # Query
            for token in query_tokens:
                tokens.append(token)
-                segment_ids.append(0)
-            tokens.append("[SEP]")
-            segment_ids.append(0)
+                segment_ids.append(sequence_a_segment_id)
+                p_mask.append(1)

+            # SEP token
+            tokens.append(sep_token)
+            segment_ids.append(sequence_a_segment_id)
+            p_mask.append(1)
+
+            # Paragraph
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
@@ -254,29 +292,43 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
-                segment_ids.append(1)
-            tokens.append("[SEP]")
-            segment_ids.append(1)
+                segment_ids.append(sequence_b_segment_id)
+                p_mask.append(0)
+            paragraph_len = doc_span.length
+
+            # SEP token
+            tokens.append(sep_token)
+            segment_ids.append(sequence_b_segment_id)
+            p_mask.append(1)
+
+            # CLS token at the end
+            if cls_token_at_end:
+                tokens.append(cls_token)
+                segment_ids.append(cls_token_segment_id)
+                p_mask.append(0)
+                cls_index = len(tokens) - 1  # Index of classification token

            input_ids = tokenizer.convert_tokens_to_ids(tokens)

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
-            input_mask = [1] * len(input_ids)
+            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
-                input_ids.append(0)
-                input_mask.append(0)
-                segment_ids.append(0)
+                input_ids.append(pad_token)
+                input_mask.append(0 if mask_padding_with_zero else 1)
+                segment_ids.append(pad_token_segment_id)
+                p_mask.append(1)

            assert len(input_ids) == max_seq_length
            assert len(input_mask) == max_seq_length
            assert len(segment_ids) == max_seq_length

+            span_is_impossible = example.is_impossible
            start_position = None
            end_position = None
-            if is_training and not example.is_impossible:
+            if is_training and not span_is_impossible:
                # For training, if our document chunk does not contain an annotation
                # we throw it out, since there is nothing to predict.
                doc_start = doc_span.start
@@ -288,13 +340,16 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                if out_of_span:
                    start_position = 0
                    end_position = 0
+                    span_is_impossible = True
                else:
-                    doc_offset = len(query_tokens) + 2
+                    doc_offset = len(query_tokens) + (1 if cls_token_at_end else 2)  # [CLS] precedes the query only when it is not appended at the end
                    start_position = tok_start_position - doc_start + doc_offset
                    end_position = tok_end_position - doc_start + doc_offset
-            if is_training and example.is_impossible:
-                start_position = 0
-                end_position = 0
+
+            if is_training and span_is_impossible:
+                start_position = cls_index
+                end_position = cls_index
+
            if example_index < 20:
                logger.info("*** Example ***")
                logger.info("unique_id: %s" % (unique_id))
@@ -311,9 +366,9 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                    "input_mask: %s" % " ".join([str(x) for x in input_mask]))
                logger.info(
                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
-                if is_training and example.is_impossible:
+                if is_training and span_is_impossible:
                    logger.info("impossible example")
-                if is_training and not example.is_impossible:
+                if is_training and not span_is_impossible:
                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
                    logger.info("start_position: %d" % (start_position))
                    logger.info("end_position: %d" % (end_position))
@@ -331,9 +386,12 @@ def convert_examples_to_features(examples, tokenizer, max_seq_length,
                    input_ids=input_ids,
                    input_mask=input_mask,
                    segment_ids=segment_ids,
+                    cls_index=cls_index,
+                    p_mask=p_mask,
+                    paragraph_len=paragraph_len,
                    start_position=start_position,
                    end_position=end_position,
-                    is_impossible=example.is_impossible))
+                    is_impossible=span_is_impossible))
            unique_id += 1

    return features
@@ -416,7 +474,6 @@ def _check_is_max_context(doc_spans, cur_span_index, position):
 RawResult = collections.namedtuple("RawResult",
                                   ["unique_id", "start_logits", "end_logits"])

-
 def write_predictions(all_examples, all_features, all_results, n_best_size,
                      max_answer_length, do_lower_case, output_prediction_file,
                      output_nbest_file, output_null_log_odds_file, verbose_logging,
@@ -608,6 +665,205 @@ def write_predictions(all_examples, all_features, all_results, n_best_size,
        with open(output_null_log_odds_file, "w") as writer:
            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")

+    return all_predictions
+
+
+# For XLNet (and XLM which uses the same head)
+RawResultExtended = collections.namedtuple("RawResultExtended",
+    ["unique_id", "start_top_log_probs", "start_top_index",
+     "end_top_log_probs", "end_top_index", "cls_logits"])
+
+
+def write_predictions_extended(all_examples, all_features, all_results, n_best_size,
+                                max_answer_length, output_prediction_file,
+                                output_nbest_file,
+                                output_null_log_odds_file, orig_data_file,
+                                start_n_top, end_n_top, version_2_with_negative,
+                                tokenizer, verbose_logging):
+    """ XLNet/XLM prediction writer: pick the best answer span per example from the top-k start/end candidates,
+        dump predictions, n-best lists and (if `version_2_with_negative`) the null scores to the given json files.
+        Candidate spans must lie inside the paragraph tokens of the feature (keys of `feature.token_to_orig_map`).
+        Returns the dict filled by `find_all_best_thresh_v2` (requires utils_squad_evaluate.py and `orig_data_file`).
+    """
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction",
+        ["feature_index", "start_index", "end_index",
+        "start_log_prob", "end_log_prob"])
+
+    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
+
+    logger.info("Writing predictions to: %s", output_prediction_file)
+    # logger.info("Writing nbest to: %s" % (output_nbest_file))
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for (example_index, example) in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+
+        for (feature_index, feature) in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+
+            cur_null_score = result.cls_logits
+
+            # if we could have irrelevant answers, get the min score of irrelevant
+            score_null = min(score_null, cur_null_score)
+
+            for i in range(start_n_top):
+                for j in range(end_n_top):
+                    start_log_prob = result.start_top_log_probs[i]
+                    start_index = result.start_top_index[i]
+
+                    j_index = i * end_n_top + j
+
+                    end_log_prob = result.end_top_log_probs[j_index]
+                    end_index = result.end_top_index[j_index]
+
+                    # Indices are positions in the full input (query, paragraph and special
+                    # tokens). Throw out spans whose start or end is not a paragraph token,
+                    # i.e. not a key of token_to_orig_map (e.g. a start inside the question).
+                    if start_index not in feature.token_to_orig_map:
+                        continue
+                    if end_index not in feature.token_to_orig_map:
+                        continue
+
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_log_prob=start_log_prob,
+                            end_log_prob=end_log_prob))
+
+        prelim_predictions = sorted(
+            prelim_predictions,
+            key=lambda x: (x.start_log_prob + x.end_log_prob),
+            reverse=True)
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+
+            # XLNet un-tokenizer
+            # Let's keep it simple for now and see if we need all this later.
+            #
+            # tok_start_to_orig_index = feature.tok_start_to_orig_index
+            # tok_end_to_orig_index = feature.tok_end_to_orig_index
+            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
+            # end_orig_pos = tok_end_to_orig_index[pred.end_index]
+            # paragraph_text = example.paragraph_text
+            # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
+
+            # Previously used Bert untokenizer
+            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+            orig_doc_start = feature.token_to_orig_map[pred.start_index]
+            orig_doc_end = feature.token_to_orig_map[pred.end_index]
+            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
+
+            # Clean whitespace
+            tok_text = tok_text.strip()
+            tok_text = " ".join(tok_text.split())
+            orig_text = " ".join(orig_tokens)
+
+            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
+                                        verbose_logging)
+
+            if final_text in seen_predictions:
+                continue
+
+            seen_predictions[final_text] = True
+
+            nbest.append(
+                _NbestPrediction(
+                    text=final_text,
+                    start_log_prob=pred.start_log_prob,
+                    end_log_prob=pred.end_log_prob))
+
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(
+                _NbestPrediction(text="", start_log_prob=-1e6,
+                end_log_prob=-1e6))
+
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_log_prob + entry.end_log_prob)
+            if not best_non_null_entry:
+                best_non_null_entry = entry
+
+        probs = _compute_softmax(total_scores)
+
+        nbest_json = []
+        for (i, entry) in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_log_prob"] = entry.start_log_prob
+            output["end_log_prob"] = entry.end_log_prob
+            nbest_json.append(output)
+
+        assert len(nbest_json) >= 1
+        assert best_non_null_entry is not None
+
+        score_diff = score_null
+        scores_diff_json[example.qas_id] = score_diff
+        # note(zhiliny): always predict best_non_null_entry
+        # and the evaluation script will search for the best threshold
+        all_predictions[example.qas_id] = best_non_null_entry.text
+
+        all_nbest_json[example.qas_id] = nbest_json
+
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+    if version_2_with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    with open(orig_data_file, "r", encoding='utf-8') as reader:
+        orig_data = json.load(reader)["data"]
+
+    qid_to_has_ans = make_qid_to_has_ans(orig_data)
+    has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
+    no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
+    exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
+    out_eval = {}
+
+    find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans)
+
+    return out_eval
+

 def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
    """Project the tokenized prediction back to the original text."""
--- a/examples/utils_squad_evaluate.py
+++ b/examples/utils_squad_evaluate.py
@@ -0,0 +1,330 @@
+""" Official evaluation script for SQuAD version 2.0.
+    Modified by XLNet authors to update `find_best_threshold` scripts for SQuAD V2.0
+
+In addition to basic functionality, we also compute additional statistics and
+plot precision-recall curves if an additional na_prob.json file is provided.
+This file is expected to map question ID's to the model's predicted probability
+that a question is unanswerable.
+"""
+import argparse
+import collections
+import json
+import numpy as np
+import os
+import re
+import string
+import sys
+
+class EVAL_OPTS():
+  """Plain container for the evaluation options that `main` reads.
+
+  Every constructor argument is stored unchanged on the instance:
+    data_file: path of the dataset JSON file.
+    pred_file: path of the predictions JSON file.
+    out_file: where to write the metrics; a falsy value makes `main` print them.
+    na_prob_file: path of the no-answer probability JSON file.
+    na_prob_thresh: threshold compared against the no-answer probabilities.
+    out_image_dir: directory for the plots, or None to skip plotting.
+    verbose: stored as given.
+  """
+  def __init__(self, data_file, pred_file, out_file="",
+               na_prob_file="na_prob.json", na_prob_thresh=1.0,
+               out_image_dir=None, verbose=False):
+    # Input locations.
+    self.data_file, self.pred_file = data_file, pred_file
+    self.na_prob_file = na_prob_file
+    # Output locations.
+    self.out_file, self.out_image_dir = out_file, out_image_dir
+    # Evaluation behaviour.
+    self.na_prob_thresh = na_prob_thresh
+    self.verbose = verbose
+
+# Module-level options placeholder; rebound to the result of parse_args()
+# when this file is executed as a script.
+OPTS = None
+
+def parse_args():
+  """Build the command-line parser and parse `sys.argv`.
+
+  When the script is started without any argument, the help text is printed
+  and the process exits with status 1.
+
+  Returns:
+    The namespace produced by `ArgumentParser.parse_args()`.
+  """
+  cli = argparse.ArgumentParser('Official evaluation script for SQuAD version 2.0.')
+  # Positional arguments.
+  cli.add_argument('data_file', metavar='data.json', help='Input data JSON file.')
+  cli.add_argument('pred_file', metavar='pred.json', help='Model predictions.')
+  # Optional arguments, registered in the order they appear in the help text.
+  optional_specs = (
+      (('--out-file', '-o'),
+       dict(metavar='eval.json',
+            help='Write accuracy metrics to file (default is stdout).')),
+      (('--na-prob-file', '-n'),
+       dict(metavar='na_prob.json',
+            help='Model estimates of probability of no answer.')),
+      (('--na-prob-thresh', '-t'),
+       dict(type=float, default=1.0,
+            help='Predict "" if no-answer probability exceeds this (default = 1.0).')),
+      (('--out-image-dir', '-p'),
+       dict(metavar='out_images', default=None,
+            help='Save precision-recall curves to directory.')),
+      (('--verbose', '-v'),
+       dict(action='store_true')),
+  )
+  for flags, spec in optional_specs:
+    cli.add_argument(*flags, **spec)
+  launched_without_args = len(sys.argv) == 1
+  if launched_without_args:
+    cli.print_help()
+    sys.exit(1)
+  return cli.parse_args()
+
+def make_qid_to_has_ans(dataset):
+  """Map every question id in `dataset` to whether it has any gold answer.
+
+  `dataset` is iterated as articles -> 'paragraphs' -> 'qas'; a question
+  counts as answerable when its 'answers' entry is truthy.
+  """
+  return {
+      qa['id']: bool(qa['answers'])
+      for article in dataset
+      for paragraph in article['paragraphs']
+      for qa in paragraph['qas']
+  }
+
+def normalize_answer(s):
+  """Lower text and remove punctuation, articles and extra whitespace."""
+  # Step 1: lower-case.
+  lowered = s.lower()
+  # Step 2: drop every ASCII punctuation character.
+  punctuation = set(string.punctuation)
+  without_punc = ''.join(c for c in lowered if c not in punctuation)
+  # Step 3: replace the stand-alone words "a", "an" and "the" by a space.
+  article_pattern = re.compile(r'\b(a|an|the)\b', re.UNICODE)
+  without_articles = article_pattern.sub(' ', without_punc)
+  # Step 4: collapse whitespace runs and trim both ends.
+  return ' '.join(without_articles.split())
+
+def get_tokens(s):
+  """Return the whitespace-separated tokens of the normalized answer `s`.
+
+  A falsy `s` (empty string, None) yields an empty list.
+  """
+  return normalize_answer(s).split() if s else []
+
+def compute_exact(a_gold, a_pred):
+  """Return 1 when both answers are equal after normalization, else 0."""
+  gold_norm = normalize_answer(a_gold)
+  pred_norm = normalize_answer(a_pred)
+  return 1 if gold_norm == pred_norm else 0
+
+def compute_f1(a_gold, a_pred):
+  """Token-level F1 between a gold answer and a predicted answer.
+
+  Returns an int (0 or 1) when either side has no tokens or when there is no
+  token overlap, and a float otherwise.
+  """
+  gold_toks = get_tokens(a_gold)
+  pred_toks = get_tokens(a_pred)
+  if not gold_toks or not pred_toks:
+    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
+    return int(gold_toks == pred_toks)
+  # Multiset intersection: each shared token counts min(gold, pred) times.
+  overlap = collections.Counter(gold_toks) & collections.Counter(pred_toks)
+  num_same = sum(overlap.values())
+  if num_same == 0:
+    return 0
+  precision = 1.0 * num_same / len(pred_toks)
+  recall = 1.0 * num_same / len(gold_toks)
+  return (2 * precision * recall) / (precision + recall)
+
+def get_raw_scores(dataset, preds):
+  """Score every predicted question against its gold answers.
+
+  Args:
+    dataset: iterable of articles, walked as 'paragraphs' -> 'qas'.
+    preds: mapping from question id to predicted answer text.
+
+  Returns:
+    (exact_scores, f1_scores): two dicts keyed by question id. Questions
+    without a prediction are reported on stdout and left out of both dicts.
+  """
+  exact_scores, f1_scores = {}, {}
+  all_qas = (qa
+             for article in dataset
+             for paragraph in article['paragraphs']
+             for qa in paragraph['qas'])
+  for qa in all_qas:
+    qid = qa['id']
+    # Keep only gold answers that are non-empty after normalization; for
+    # unanswerable questions the only correct answer is the empty string.
+    gold_answers = [ans['text'] for ans in qa['answers']
+                    if normalize_answer(ans['text'])] or ['']
+    if qid not in preds:
+      print('Missing prediction for %s' % qid)
+      continue
+    a_pred = preds[qid]
+    # Best score over all gold answers.
+    exact_scores[qid] = max(compute_exact(gold, a_pred) for gold in gold_answers)
+    f1_scores[qid] = max(compute_f1(gold, a_pred) for gold in gold_answers)
+  return exact_scores, f1_scores
+
+def apply_no_ans_threshold(scores, na_probs, qid_to_has_ans, na_prob_thresh):
+  """Re-score questions whose no-answer probability exceeds the threshold.
+
+  For such a question the score becomes 1.0 when it is unanswerable and 0.0
+  when it is answerable; every other question keeps its original score.
+  Returns a new dict; `scores` is not modified.
+  """
+  def thresholded(qid, raw_score):
+    if na_probs[qid] > na_prob_thresh:
+      return float(not qid_to_has_ans[qid])
+    return raw_score
+
+  return {qid: thresholded(qid, raw_score) for qid, raw_score in scores.items()}
+
+def make_eval_dict(exact_scores, f1_scores, qid_list=None):
+  """Aggregate per-question scores into percentages.
+
+  With a non-empty `qid_list` only those questions are averaged; otherwise
+  (None or empty) every entry of the score dicts is used.
+
+  Returns:
+    OrderedDict with the keys 'exact', 'f1' and 'total', in that order.
+  """
+  if qid_list:
+    total = len(qid_list)
+    exact_sum = sum(exact_scores[k] for k in qid_list)
+    f1_sum = sum(f1_scores[k] for k in qid_list)
+  else:
+    total = len(exact_scores)
+    exact_sum = sum(exact_scores.values())
+    f1_sum = sum(f1_scores.values())
+  return collections.OrderedDict([
+      ('exact', 100.0 * exact_sum / total),
+      ('f1', 100.0 * f1_sum / total),
+      ('total', total),
+  ])
+
+def merge_eval(main_eval, new_eval, prefix):
+  """Copy every entry of `new_eval` into `main_eval` under '<prefix>_<key>'."""
+  for key, value in new_eval.items():
+    main_eval['%s_%s' % (prefix, key)] = value
+
+def plot_pr_curve(precisions, recalls, out_image, title):
+  """Draw a precision-recall step curve and save it to `out_image`.
+
+  Args:
+    precisions: y values of the curve.
+    recalls: x values of the curve.
+    out_image: path handed to `savefig`.
+    title: title of the figure.
+  """
+  # `plt` is bound at module level only when this file runs as a script, so
+  # import it here to keep the function usable when the module is imported.
+  import matplotlib.pyplot as plt
+  plt.step(recalls, precisions, color='b', alpha=0.2, where='post')
+  plt.fill_between(recalls, precisions, step='post', alpha=0.2, color='b')
+  plt.xlabel('Recall')
+  plt.ylabel('Precision')
+  plt.xlim([0.0, 1.05])
+  plt.ylim([0.0, 1.05])
+  plt.title(title)
+  plt.savefig(out_image)
+  # Clear the current figure so the next plot starts from an empty canvas.
+  plt.clf()
+
+def make_precision_recall_eval(scores, na_probs, num_true_pos, qid_to_has_ans,
+                               out_image=None, title=None):
+  """Compute average precision over questions ranked by no-answer probability.
+
+  Questions are visited in increasing order of `na_probs`. A curve point is
+  recorded wherever a threshold can be placed, i.e. at the last question and
+  wherever the next question has a different probability.
+
+  Args:
+    scores: per-question scores; only answerable questions contribute.
+    na_probs: mapping from question id to no-answer probability.
+    num_true_pos: denominator used for recall.
+    qid_to_has_ans: mapping from question id to answerability.
+    out_image: when truthy, the curve is plotted to this path.
+    title: title passed to the plot.
+
+  Returns:
+    {'ap': average precision as a percentage}.
+  """
+  ranked_qids = sorted(na_probs, key=na_probs.__getitem__)
+  last_index = len(ranked_qids) - 1
+  precisions, recalls = [1.0], [0.0]
+  true_pos = 0.0
+  avg_prec = 0.0
+  for idx, qid in enumerate(ranked_qids):
+    if qid_to_has_ans[qid]:
+      true_pos += scores[qid]
+    precision = true_pos / float(idx + 1)
+    recall = true_pos / float(num_true_pos)
+    is_last = idx == last_index
+    if is_last or na_probs[qid] != na_probs[ranked_qids[idx + 1]]:
+      # i.e., if we can put a threshold after this point
+      avg_prec += precision * (recall - recalls[-1])
+      precisions.append(precision)
+      recalls.append(recall)
+  if out_image:
+    plot_pr_curve(precisions, recalls, out_image, title)
+  return {'ap': 100.0 * avg_prec}
+
+def run_precision_recall_analysis(main_eval, exact_raw, f1_raw, na_probs,
+                                  qid_to_has_ans, out_image_dir):
+  """Add average-precision metrics to `main_eval` and plot the three curves.
+
+  Creates `out_image_dir` when it is set and does not exist yet. Returns
+  without touching `main_eval` when no question is answerable. Otherwise the
+  exact-match, F1 and oracle curves are evaluated and merged into `main_eval`
+  under the prefixes 'pr_exact', 'pr_f1' and 'pr_oracle'.
+  """
+  need_dir = out_image_dir and not os.path.exists(out_image_dir)
+  if need_dir:
+    os.makedirs(out_image_dir)
+  num_true_pos = len([qid for qid, has_ans in qid_to_has_ans.items() if has_ans])
+  if num_true_pos == 0:
+    return
+  # Oracle scores: 1.0 for every answerable question, 0.0 otherwise.
+  oracle_scores = {k: float(v) for k, v in qid_to_has_ans.items()}
+  curves = (
+      ('pr_exact', exact_raw, 'pr_exact.png',
+       'Precision-Recall curve for Exact Match score'),
+      ('pr_f1', f1_raw, 'pr_f1.png',
+       'Precision-Recall curve for F1 score'),
+      ('pr_oracle', oracle_scores, 'pr_oracle.png',
+       'Oracle Precision-Recall curve (binary task of HasAns vs. NoAns)'),
+  )
+  # Evaluate every curve first, then merge, so `main_eval` is only updated
+  # once all three evaluations have succeeded.
+  results = []
+  for prefix, curve_scores, file_name, title in curves:
+    curve_eval = make_precision_recall_eval(
+        curve_scores, na_probs, num_true_pos, qid_to_has_ans,
+        out_image=os.path.join(out_image_dir, file_name),
+        title=title)
+    results.append((prefix, curve_eval))
+  for prefix, curve_eval in results:
+    merge_eval(main_eval, curve_eval, prefix)
+
+def histogram_na_prob(na_probs, qid_list, image_dir, name):
+  """Save a histogram of the no-answer probabilities of `qid_list`.
+
+  Does nothing when `qid_list` is empty. Otherwise the image is written to
+  '<image_dir>/na_prob_hist_<name>.png'.
+
+  Args:
+    na_probs: mapping from question id to no-answer probability.
+    qid_list: ids of the questions to include.
+    image_dir: directory receiving the image.
+    name: label used in the title and in the file name.
+  """
+  if not qid_list:
+    return
+  # `plt` is bound at module level only when this file runs as a script, so
+  # import it here to keep the function usable when the module is imported.
+  import matplotlib.pyplot as plt
+  x = [na_probs[k] for k in qid_list]
+  # Equal weights summing to one, so bar heights are proportions, not counts.
+  weights = np.ones_like(x) / float(len(x))
+  plt.hist(x, weights=weights, bins=20, range=(0.0, 1.0))
+  plt.xlabel('Model probability of no-answer')
+  plt.ylabel('Proportion of dataset')
+  plt.title('Histogram of no-answer probability: %s' % name)
+  plt.savefig(os.path.join(image_dir, 'na_prob_hist_%s.png' % name))
+  # Clear the current figure so the next plot starts from an empty canvas.
+  plt.clf()
+
+def find_best_thresh(preds, scores, na_probs, qid_to_has_ans):
+  """Search for the no-answer threshold that maximizes the total score.
+
+  The running score starts at the number of unanswerable questions. Scored
+  questions are then visited in increasing order of `na_probs`: an answerable
+  question adds its score, an unanswerable one with a non-empty prediction
+  subtracts 1.
+
+  Returns:
+    (best_score, best_thresh): the best running score as a percentage of
+    `len(scores)`, and the `na_probs` value at which it was reached (0.0 when
+    the starting score is never exceeded).
+  """
+  running = sum(1 for has_ans in qid_to_has_ans.values() if not has_ans)
+  best_score, best_thresh = running, 0.0
+  for qid in sorted(na_probs, key=na_probs.__getitem__):
+    if qid not in scores:
+      continue
+    if qid_to_has_ans[qid]:
+      running += scores[qid]
+    elif preds[qid]:
+      running -= 1
+    if running > best_score:
+      best_score, best_thresh = running, na_probs[qid]
+  return 100.0 * best_score / len(scores), best_thresh
+
+def find_best_thresh_v2(preds, scores, na_probs, qid_to_has_ans):
+  """Search for the best no-answer threshold and score answerable questions.
+
+  The running score starts at the number of unanswerable questions. Scored
+  questions are then visited in increasing order of `na_probs`: an answerable
+  question adds its score, an unanswerable one with a non-empty prediction
+  subtracts 1.
+
+  Returns:
+    (best_score, best_thresh, has_ans_score):
+      best_score: best running score as a percentage of `len(scores)`.
+      best_thresh: `na_probs` value at which it was reached (0.0 when the
+        starting score is never exceeded).
+      has_ans_score: sum of the scores of answerable questions divided by the
+        number of answerable questions found in `na_probs` (a fraction, not a
+        percentage); 0.0 when there is no answerable question.
+  """
+  num_no_ans = sum(1 for k in qid_to_has_ans if not qid_to_has_ans[k])
+  cur_score = num_no_ans
+  best_score = cur_score
+  best_thresh = 0.0
+  qid_list = sorted(na_probs, key=lambda k: na_probs[k])
+  for qid in qid_list:
+    if qid not in scores: continue
+    if qid_to_has_ans[qid]:
+      diff = scores[qid]
+    else:
+      if preds[qid]:
+        diff = -1
+      else:
+        diff = 0
+    cur_score += diff
+    if cur_score > best_score:
+      best_score = cur_score
+      best_thresh = na_probs[qid]
+
+  has_ans_score, has_ans_cnt = 0, 0
+  for qid in qid_list:
+    if not qid_to_has_ans[qid]: continue
+    # Unscored answerable questions still count in the denominator.
+    has_ans_cnt += 1
+
+    if qid not in scores: continue
+    has_ans_score += scores[qid]
+
+  # Without any answerable question the average is undefined; report 0.0
+  # instead of raising ZeroDivisionError.
+  has_ans_avg = 1.0 * has_ans_score / has_ans_cnt if has_ans_cnt else 0.0
+  return 100.0 * best_score / len(scores), best_thresh, has_ans_avg
+
+def find_all_best_thresh(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+  """Store the best exact-match and F1 scores and thresholds in `main_eval`.
+
+  Writes the keys 'best_exact', 'best_exact_thresh', 'best_f1' and
+  'best_f1_thresh', each taken from `find_best_thresh`.
+  """
+  exact_result = find_best_thresh(preds, exact_raw, na_probs, qid_to_has_ans)
+  f1_result = find_best_thresh(preds, f1_raw, na_probs, qid_to_has_ans)
+  main_eval['best_exact'], main_eval['best_exact_thresh'] = exact_result
+  main_eval['best_f1'], main_eval['best_f1_thresh'] = f1_result
+
+def find_all_best_thresh_v2(main_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans):
+  """Store best scores, thresholds and answerable-only scores in `main_eval`.
+
+  Writes the keys 'best_exact', 'best_exact_thresh', 'best_f1',
+  'best_f1_thresh', 'has_ans_exact' and 'has_ans_f1', each taken from
+  `find_best_thresh_v2`.
+  """
+  exact_stats = find_best_thresh_v2(preds, exact_raw, na_probs, qid_to_has_ans)
+  f1_stats = find_best_thresh_v2(preds, f1_raw, na_probs, qid_to_has_ans)
+  best_exact, exact_thresh, has_ans_exact = exact_stats
+  best_f1, f1_thresh, has_ans_f1 = f1_stats
+  labelled = (
+      ('best_exact', best_exact),
+      ('best_exact_thresh', exact_thresh),
+      ('best_f1', best_f1),
+      ('best_f1_thresh', f1_thresh),
+      ('has_ans_exact', has_ans_exact),
+      ('has_ans_f1', has_ans_f1),
+  )
+  for key, value in labelled:
+    main_eval[key] = value
+
+def main(OPTS):
+  """Evaluate a predictions file against a dataset and return the metrics.
+
+  Args:
+    OPTS: object exposing `data_file`, `pred_file`, `na_prob_file`,
+      `na_prob_thresh`, `out_image_dir` and `out_file`.
+
+  Returns:
+    The metrics dict, which is also written to `OPTS.out_file` as JSON, or
+    printed when no output file is configured.
+  """
+  with open(OPTS.data_file) as data_fh:
+    dataset = json.load(data_fh)['data']
+  with open(OPTS.pred_file) as pred_fh:
+    preds = json.load(pred_fh)
+  if OPTS.na_prob_file:
+    with open(OPTS.na_prob_file) as na_fh:
+      na_probs = json.load(na_fh)
+  else:
+    # No file given: use a no-answer probability of 0.0 for every prediction.
+    na_probs = {k: 0.0 for k in preds}
+
+  # Split the question ids into answerable and unanswerable ones.
+  qid_to_has_ans = make_qid_to_has_ans(dataset)  # maps qid to True/False
+  has_ans_qids, no_ans_qids = [], []
+  for qid, has_ans in qid_to_has_ans.items():
+    (has_ans_qids if has_ans else no_ans_qids).append(qid)
+
+  exact_raw, f1_raw = get_raw_scores(dataset, preds)
+  thresh = OPTS.na_prob_thresh
+  exact_thresh = apply_no_ans_threshold(exact_raw, na_probs, qid_to_has_ans, thresh)
+  f1_thresh = apply_no_ans_threshold(f1_raw, na_probs, qid_to_has_ans, thresh)
+
+  # Overall metrics first, then one prefixed set per non-empty subset.
+  out_eval = make_eval_dict(exact_thresh, f1_thresh)
+  for prefix, subset in (('HasAns', has_ans_qids), ('NoAns', no_ans_qids)):
+    if subset:
+      subset_eval = make_eval_dict(exact_thresh, f1_thresh, qid_list=subset)
+      merge_eval(out_eval, subset_eval, prefix)
+
+  # Threshold search and plots need real no-answer probabilities.
+  if OPTS.na_prob_file:
+    find_all_best_thresh(out_eval, preds, exact_raw, f1_raw, na_probs, qid_to_has_ans)
+    if OPTS.out_image_dir:
+      run_precision_recall_analysis(out_eval, exact_raw, f1_raw, na_probs,
+                                    qid_to_has_ans, OPTS.out_image_dir)
+      histogram_na_prob(na_probs, has_ans_qids, OPTS.out_image_dir, 'hasAns')
+      histogram_na_prob(na_probs, no_ans_qids, OPTS.out_image_dir, 'noAns')
+
+  if not OPTS.out_file:
+    print(json.dumps(out_eval, indent=2))
+  else:
+    with open(OPTS.out_file, 'w') as out_fh:
+      json.dump(out_eval, out_fh)
+  return out_eval
+
+# Script entry point: parse the command line, set up plotting when an image
+# directory was requested, then run the evaluation.
+if __name__ == '__main__':
+  OPTS = parse_args()
+  if OPTS.out_image_dir:
+    # matplotlib is imported only when plots are requested. The 'Agg' backend
+    # is selected before pyplot is imported; `plt` then becomes the
+    # module-level name used by the plotting helpers.
+    import matplotlib
+    matplotlib.use('Agg')
+    import matplotlib.pyplot as plt 
+  main(OPTS)
--- a/hubconfs/bert_hubconf.py
+++ b/hubconfs/bert_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization import BertTokenizer
-from pytorch_pretrained_bert.modeling import (
+from pytorch_transformers.tokenization_bert import BertTokenizer
+from pytorch_transformers.modeling_bert import (
        BertModel,
        BertForNextSentencePrediction,
        BertForMaskedLM,
@@ -86,7 +86,7 @@ def bertTokenizer(*args, **kwargs):
    Example:
        >>> import torch
        >>> sentence = 'Hello, World!'
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        >>> toks = tokenizer.tokenize(sentence)
        ['Hello', '##,', 'World', '##!']
        >>> ids = tokenizer.convert_tokens_to_ids(toks)
@@ -106,7 +106,7 @@ def bertModel(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
@@ -115,7 +115,7 @@ def bertModel(*args, **kwargs):
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertModel', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
        >>> model.eval()
        # Predict hidden states features for each layer
        >>> with torch.no_grad():
@@ -135,7 +135,7 @@ def bertForNextSentencePrediction(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
@@ -144,7 +144,7 @@ def bertForNextSentencePrediction(*args, **kwargs):
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForNextSentencePrediction
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForNextSentencePrediction', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
        >>> model.eval()
        # Predict the next sentence classification logits
        >>> with torch.no_grad():
@@ -165,7 +165,7 @@ def bertForPreTraining(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
@@ -173,7 +173,7 @@ def bertForPreTraining(*args, **kwargs):
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForPreTraining
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForPreTraining', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
        >>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
    """
    model = BertForPreTraining.from_pretrained(*args, **kwargs)
@@ -189,7 +189,7 @@ def bertForMaskedLM(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
@@ -200,7 +200,7 @@ def bertForMaskedLM(*args, **kwargs):
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForMaskedLM
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMaskedLM', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
        >>> model.eval()
        # Predict all tokens
        >>> with torch.no_grad():
@@ -231,7 +231,7 @@ def bertForSequenceClassification(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
@@ -240,7 +240,7 @@ def bertForSequenceClassification(*args, **kwargs):
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForSequenceClassification
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
        >>> model.eval()
        # Predict the sequence classification logits
        >>> with torch.no_grad():
@@ -266,7 +266,7 @@ def bertForMultipleChoice(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
@@ -275,7 +275,7 @@ def bertForMultipleChoice(*args, **kwargs):
        >>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
        >>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
        # Load bertForMultipleChoice
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
        >>> model.eval()
        # Predict the multiple choice logits
        >>> with torch.no_grad():
@@ -299,7 +299,7 @@ def bertForQuestionAnswering(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
@@ -308,7 +308,7 @@ def bertForQuestionAnswering(*args, **kwargs):
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForQuestionAnswering
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForQuestionAnswering', 'bert-base-cased')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
        >>> model.eval()
        # Predict the start and end positions logits
        >>> with torch.no_grad():
@@ -338,7 +338,7 @@ def bertForTokenClassification(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
        #  Prepare tokenized input
        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
        >>> tokenized_text = tokenizer.tokenize(text)
@@ -347,7 +347,7 @@ def bertForTokenClassification(*args, **kwargs):
        >>> tokens_tensor = torch.tensor([indexed_tokens])
        >>> segments_tensors = torch.tensor([segments_ids])
        # Load bertForTokenClassification
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
        >>> model.eval()
        # Predict the token classification logits
        >>> with torch.no_grad():
--- a/hubconfs/gpt2_hubconf.py
+++ b/hubconfs/gpt2_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_gpt2 import GPT2Tokenizer
-from pytorch_pretrained_bert.modeling_gpt2 import (
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer
+from pytorch_transformers.modeling_gpt2 import (
    GPT2Model,
    GPT2LMHeadModel,
    GPT2DoubleHeadsModel
@@ -53,7 +53,7 @@ def gpt2Tokenizer(*args, **kwargs):

    Example:
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        >>> text = "Who was Jim Henson ?"
        >>> indexed_tokens = tokenizer.encode(tokenized_text)
@@ -72,7 +72,7 @@ def gpt2Model(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        #  Prepare tokenized input
        >>> text_1 = "Who was Jim Henson ?"
@@ -83,7 +83,7 @@ def gpt2Model(*args, **kwargs):
        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load gpt2Model
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Model', 'gpt2')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
        >>> model.eval()

        # Predict hidden states features for each layer
@@ -105,7 +105,7 @@ def gpt2LMHeadModel(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        #  Prepare tokenized input
        >>> text_1 = "Who was Jim Henson ?"
@@ -116,7 +116,7 @@ def gpt2LMHeadModel(*args, **kwargs):
        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load gpt2LMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2LMHeadModel', 'gpt2')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
        >>> model.eval()

        # Predict hidden states features for each layer
@@ -144,7 +144,7 @@ def gpt2DoubleHeadsModel(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2Tokenizer', 'gpt2')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')

        #  Prepare tokenized input
        >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -157,7 +157,7 @@ def gpt2DoubleHeadsModel(*args, **kwargs):
        >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])

        # Load gpt2DoubleHeadsModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'gpt2DoubleHeadsModel', 'gpt2')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
        >>> model.eval()

        # Predict hidden states features for each layer
--- a/hubconfs/gpt_hubconf.py
+++ b/hubconfs/gpt_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_openai import OpenAIGPTTokenizer
-from pytorch_pretrained_bert.modeling_openai import (
+from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer
+from pytorch_transformers.modeling_openai import (
 	OpenAIGPTModel,
 	OpenAIGPTLMHeadModel,
 	OpenAIGPTDoubleHeadsModel
@@ -77,7 +77,7 @@ def openAIGPTTokenizer(*args, **kwargs):

    Example:
 		>>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 		
 		>>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
        >>> tokenized_text = tokenizer.tokenize(text)
@@ -98,7 +98,7 @@ def openAIGPTModel(*args, **kwargs):
    Example:
        # Load the tokenizer
 		>>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')

        #  Prepare tokenized input
        >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -107,7 +107,7 @@ def openAIGPTModel(*args, **kwargs):
        >>> tokens_tensor = torch.tensor([indexed_tokens])

        # Load openAIGPTModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTModel', 'openai-gpt')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
        >>> model.eval()

        # Predict hidden states features for each layer
@@ -127,7 +127,7 @@ def openAIGPTLMHeadModel(*args, **kwargs):
 	Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')

        #  Prepare tokenized input
        >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -136,7 +136,7 @@ def openAIGPTLMHeadModel(*args, **kwargs):
        >>> tokens_tensor = torch.tensor([indexed_tokens])

        # Load openAIGPTLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTLMHeadModel', 'openai-gpt')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
        >>> model.eval()

        # Predict hidden states features for each layer
@@ -162,7 +162,7 @@ def openAIGPTDoubleHeadsModel(*args, **kwargs):
 	Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTTokenizer', 'openai-gpt')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')

        #  Prepare tokenized input
        >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
@@ -175,7 +175,7 @@ def openAIGPTDoubleHeadsModel(*args, **kwargs):
        >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])

        # Load openAIGPTDoubleHeadsModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
        >>> model.eval()

        # Predict hidden states features for each layer
--- a/hubconfs/transformer_xl_hubconf.py
+++ b/hubconfs/transformer_xl_hubconf.py
@@ -1,5 +1,5 @@
-from pytorch_pretrained_bert.tokenization_transfo_xl import TransfoXLTokenizer
-from pytorch_pretrained_bert.modeling_transfo_xl import (
+from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer
+from pytorch_transformers.modeling_transfo_xl import (
    TransfoXLModel,
    TransfoXLLMHeadModel
 )
@@ -46,7 +46,7 @@ def transformerXLTokenizer(*args, **kwargs):

    Example:
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
        
        >>> text = "Who was Jim Henson ?"
        >>> tokenized_text = tokenizer.tokenize(tokenized_text)
@@ -64,7 +64,7 @@ def transformerXLModel(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')

        #  Prepare tokenized input
        >>> text_1 = "Who was Jim Henson ?"
@@ -77,7 +77,7 @@ def transformerXLModel(*args, **kwargs):
        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load transformerXLModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLModel', 'transfo-xl-wt103')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
        >>> model.eval()

        # Predict hidden states features for each layer
@@ -99,7 +99,7 @@ def transformerXLLMHeadModel(*args, **kwargs):
    Example:
        # Load the tokenizer
        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')

        #  Prepare tokenized input
        >>> text_1 = "Who was Jim Henson ?"
@@ -112,7 +112,7 @@ def transformerXLLMHeadModel(*args, **kwargs):
        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])

        # Load transformerXLLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-pretrained-BERT', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
        >>> model.eval()

        # Predict hidden states features for each layer
--- a/hubconfs/xlm_hubconf.py
+++ b/hubconfs/xlm_hubconf.py
@@ -0,0 +1,167 @@
+from pytorch_transformers.tokenization_xlm import XLMTokenizer
+from pytorch_transformers.modeling_xlm import (
+    XLMConfig,
+    XLMModel,
+    XLMWithLMHeadModel,
+    XLMForSequenceClassification,
+    XLMForQuestionAnswering
+)
+
+# A lot of models share the same param doc. Use a decorator
+# to save typing
+xlm_start_docstring = """
+    Model class adapted from the XLM Transformer model of
+        "Cross-lingual Language Model Pretraining" by Guillaume Lample, Alexis Conneau
+        Paper: https://arxiv.org/abs/1901.07291
+        Original code: https://github.com/facebookresearch/XLM
+
+    Example:
+        # Load the tokenizer
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
+
+        #  Prepare tokenized input
+        >>> text_1 = "Who was Jim Henson ?"
+        >>> text_2 = "Jim Henson was a puppeteer"
+        >>> indexed_tokens_1 = tokenizer.encode(text_1)
+        >>> indexed_tokens_2 = tokenizer.encode(text_2)
+        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+"""
+
+# A lot of models share the same param doc. Use a decorator
+# to save typing
+xlm_end_docstring = """
+    Params:
+        pretrained_model_name_or_path: either:
+            - a str with the name of a pre-trained model to load selected in the list of:
+                . `xlm-mlm-en-2048`
+            - a path or url to a pretrained model archive containing:
+                . `config.json` a configuration file for the model
+                . `pytorch_model.bin` a PyTorch dump created using the `convert_xlm_checkpoint_to_pytorch` conversion script
+        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
+        *inputs, **kwargs: additional input for the specific XLM class
+"""
+
+
+def _begin_with_docstring(docstr):  # decorator factory: prepend `docstr` to fn.__doc__
+    def docstring_decorator(fn):
+        fn.__doc__ = docstr + fn.__doc__  # prepend (the _end_ variant appends)
+        return fn
+    return docstring_decorator
+
+def _end_with_docstring(docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = fn.__doc__ + docstr
+        return fn
+    return docstring_decorator
+
+
+def xlmTokenizer(*args, **kwargs):
+    """
+    Instantiate a XLM BPE tokenizer for XLM from a pre-trained vocab file.
+
+    Args:
+    pretrained_model_name_or_path: Path to pretrained model archive
+                                   or one of pre-trained vocab configs below.
+                                       * xlm-mlm-en-2048
+    Keyword args:
+    special_tokens: Special tokens in vocabulary that are not pretrained
+                    Default: None
+    max_len: An artificial maximum length to truncate tokenized sequences to;
+             Effective maximum length is always the minimum of this
+             value (if specified) and the underlying model's
+             sequence length.
+             Default: None
+
+    Example:
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
+
+        >>> text = "Who was Jim Henson ?"
+        >>> indexed_tokens = tokenizer.encode(text)
+    """
+    tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
+    return tokenizer
+
+
+@_begin_with_docstring(xlm_start_docstring)
+@_end_with_docstring(xlm_end_docstring)
+def xlmModel(*args, **kwargs):
+    """
+        # Load xlmModel
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
+        >>> model.eval()
+
+        # Predict hidden states features for each layer
+        >>> with torch.no_grad():
+                hidden_states_1, mems = model(tokens_tensor_1)
+                hidden_states_2, mems = model(tokens_tensor_2, past=mems)
+    """
+    model = XLMModel.from_pretrained(*args, **kwargs)
+    return model
+
+
+@_begin_with_docstring(xlm_start_docstring)
+@_end_with_docstring(xlm_end_docstring)
+def xlmLMHeadModel(*args, **kwargs):
+    """
+        #  Prepare tokenized input
+        >>> text_1 = "Who was Jim Henson ?"
+        >>> text_2 = "Jim Henson was a puppeteer"
+        >>> indexed_tokens_1 = tokenizer.encode(text_1)
+        >>> indexed_tokens_2 = tokenizer.encode(text_2)
+        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+
+        # Load xlmLMHeadModel
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlmLMHeadModel', 'xlm-mlm-en-2048')
+        >>> model.eval()
+
+        # Predict hidden states features for each layer
+        >>> with torch.no_grad():
+                predictions_1, mems = model(tokens_tensor_1)
+                predictions_2, mems = model(tokens_tensor_2, mems=mems)
+
+        # Get the predicted last token
+        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+        >>> predicted_token = tokenizer.decode([predicted_index])
+        >>> assert predicted_token == ' who'
+    """
+    model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
+    return model
+
+
+# @_end_with_docstring(xlnet_docstring)
+# def xlnetForSequenceClassification(*args, **kwargs):
+#     """
+#     xlnetModel is the basic XLNet Transformer model from
+#         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
+#         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
+
+#     Example:
+#         # Load the tokenizer
+#         >>> import torch
+#         >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
+
+#         #  Prepare tokenized input
+#         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+#         >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+#         >>> tokenized_text1 = tokenizer.tokenize(text1)
+#         >>> tokenized_text2 = tokenizer.tokenize(text2)
+#         >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+#         >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+#         >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+#         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+
+#         # Load xlnetForSequenceClassification
+#         >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
+#         >>> model.eval()
+
+#         # Predict sequence classes logits
+#         >>> with torch.no_grad():
+#                 lm_logits, mems = model(tokens_tensor)
+#     """
+#     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
+#     return model
--- a/hubconfs/xlnet_hubconf.1.py
+++ b/hubconfs/xlnet_hubconf.1.py
@@ -0,0 +1,169 @@
+from pytorch_transformers.tokenization_xlnet import XLNetTokenizer
+from pytorch_transformers.modeling_xlnet import (
+    XLNetConfig,
+    XLNetModel,
+    XLNetLMHeadModel,
+    # XLNetForSequenceClassification
+)
+
+# A lot of models share the same param doc. Use a decorator
+# to save typing
+xlnet_docstring = """
+    Params:
+        pretrained_model_name_or_path: either:
+            - a str with the name of a pre-trained model to load selected in the list of:
+                . `xlnet-large-cased`
+            - a path or url to a pretrained model archive containing:
+                . `config.json` a configuration file for the model
+                . `pytorch_model.bin` a PyTorch dump of a XLNetForPreTraining instance
+            - a path or url to a pretrained model archive containing:
+                . `xlnet_config.json` a configuration file for the model
+                . `model.chkpt` a TensorFlow checkpoint
+        from_tf: should we load the weights from a locally saved TensorFlow checkpoint
+        cache_dir: an optional path to a folder in which the pre-trained models will be cached.
+        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
+        *inputs, **kwargs: additional input for the specific XLNet class
+"""
+
+
+def _append_from_pretrained_docstring(docstr):
+    def docstring_decorator(fn):
+        fn.__doc__ = fn.__doc__ + docstr
+        return fn
+    return docstring_decorator
+
+
+def xlnetTokenizer(*args, **kwargs):
+    """
+    Instantiate a XLNet sentencepiece tokenizer for XLNet from a pre-trained vocab file.
+    Peculiarities:
+        - require Google sentencepiece (https://github.com/google/sentencepiece)
+
+    Args:
+    pretrained_model_name_or_path: Path to pretrained model archive
+                                   or one of pre-trained vocab configs below.
+                                       * xlnet-large-cased
+    Keyword args:
+    special_tokens: Special tokens in vocabulary that are not pretrained
+                    Default: None
+    max_len: An artificial maximum length to truncate tokenized sequences to;
+             Effective maximum length is always the minimum of this
+             value (if specified) and the underlying model's
+             sequence length.
+             Default: None
+
+    Example:
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+
+        >>> text = "Who was Jim Henson ?"
+        >>> indexed_tokens = tokenizer.encode(text)
+    """
+    tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
+    return tokenizer
+
+
+@_append_from_pretrained_docstring(xlnet_docstring)
+def xlnetModel(*args, **kwargs):
+    """
+    xlnetModel is the basic XLNet Transformer model from
+        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
+        by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
+
+    Example:
+        # Load the tokenizer
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+
+        #  Prepare tokenized input
+        >>> text_1 = "Who was Jim Henson ?"
+        >>> text_2 = "Jim Henson was a puppeteer"
+        >>> indexed_tokens_1 = tokenizer.encode(text_1)
+        >>> indexed_tokens_2 = tokenizer.encode(text_2)
+        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+
+        # Load xlnetModel
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
+        >>> model.eval()
+
+        # Predict hidden states features for each layer
+        >>> with torch.no_grad():
+                hidden_states_1, mems = model(tokens_tensor_1)
+                hidden_states_2, mems = model(tokens_tensor_2, mems=mems)
+    """
+    model = XLNetModel.from_pretrained(*args, **kwargs)
+    return model
+
+
+@_append_from_pretrained_docstring(xlnet_docstring)
+def xlnetLMHeadModel(*args, **kwargs):
+    """
+    xlnetLMHeadModel is the XLNet Transformer model from
+        "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
+        by Zhilin Yang, Zihang Dai, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
+    with a tied (pre-trained) language modeling head on top.
+
+    Example:
+        # Load the tokenizer
+        >>> import torch
+        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+
+        #  Prepare tokenized input
+        >>> text_1 = "Who was Jim Henson ?"
+        >>> text_2 = "Jim Henson was a puppeteer"
+        >>> indexed_tokens_1 = tokenizer.encode(text_1)
+        >>> indexed_tokens_2 = tokenizer.encode(text_2)
+        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+
+        # Load xlnetLMHeadModel
+        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
+        >>> model.eval()
+
+        # Predict hidden states features for each layer
+        >>> with torch.no_grad():
+                predictions_1, mems = model(tokens_tensor_1)
+                predictions_2, mems = model(tokens_tensor_2, mems=mems)
+
+        # Get the predicted last token
+        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+        >>> predicted_token = tokenizer.decode([predicted_index])
+        >>> assert predicted_token == ' who'
+    """
+    model = XLNetLMHeadModel.from_pretrained(*args, **kwargs)
+    return model
+
+
+# @_append_from_pretrained_docstring(xlnet_docstring)
+# def xlnetForSequenceClassification(*args, **kwargs):
+#     """
+#     xlnetModel is the basic XLNet Transformer model from
+#         "XLNet: Generalized Autoregressive Pretraining for Language Understanding"
+#         by Zhilin Yang, Zihang Dai1, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le
+
+#     Example:
+#         # Load the tokenizer
+#         >>> import torch
+#         >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+
+#         #  Prepare tokenized input
+#         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+#         >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+#         >>> tokenized_text1 = tokenizer.tokenize(text1)
+#         >>> tokenized_text2 = tokenizer.tokenize(text2)
+#         >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+#         >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+#         >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+#         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+
+#         # Load xlnetForSequenceClassification
+#         >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
+#         >>> model.eval()
+
+#         # Predict sequence classes logits
+#         >>> with torch.no_grad():
+#                 lm_logits, mems = model(tokens_tensor)
+#     """
+#     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
+#     return model
--- a/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models-MLM-NSP.ipynb
@@ -78,7 +78,7 @@
    "import importlib.util\n",
    "import sys\n",
    "import tensorflow as tf\n",
-    "import pytorch_pretrained_bert as ppb\n",
+    "import pytorch_transformers as ppb\n",
    "\n",
    "def del_all_flags(FLAGS):\n",
    "    flags_dict = FLAGS._flags()    \n",
@@ -3997,9 +3997,9 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
-      "11/16/2018 11:03:05 - INFO - pytorch_pretrained_bert.modeling -   extracting archive file /Users/thomaswolf/.pytorch_pretrained_bert/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
-      "11/16/2018 11:03:08 - INFO - pytorch_pretrained_bert.modeling -   Model config {\n",
+      "11/16/2018 11:03:05 - INFO - pytorch_transformers.modeling_bert -   loading archive file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased.tar.gz from cache at /Users/thomaswolf/.pytorch_transformers/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba\n",
+      "11/16/2018 11:03:05 - INFO - pytorch_transformers.modeling_bert -   extracting archive file /Users/thomaswolf/.pytorch_transformers/9c41111e2de84547a463fd39217199738d1e3deb72d4fec4399e6e241983c6f0.ae3cef932725ca7a30cdcb93fc6e09150a55e2a130ec7af63975a16c153ae2ba to temp dir /var/folders/yx/cw8n_njx3js5jksyw_qlp8p00000gn/T/tmpaqgsm566\n",
+      "11/16/2018 11:03:08 - INFO - pytorch_transformers.modeling_bert -   Model config {\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
--- a/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models-SQuAD.ipynb
@@ -86,7 +86,7 @@
    "spec.loader.exec_module(module)\n",
    "sys.modules['modeling_tensorflow'] = module\n",
    "\n",
-    "spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/run_squad.py')\n",
+    "spec = importlib.util.spec_from_file_location('*', original_tf_inplem_dir + '/run_bert_squad.py')\n",
    "module = importlib.util.module_from_spec(spec)\n",
    "spec.loader.exec_module(module)\n",
    "sys.modules['run_squad_tensorflow'] = module\n",
--- a/notebooks/Comparing-TF-and-PT-models.ipynb
+++ b/notebooks/Comparing-TF-and-PT-models.ipynb
@@ -342,7 +342,7 @@
   "outputs": [],
   "source": [
    "import extract_features\n",
-    "import pytorch_pretrained_bert as ppb\n",
+    "import pytorch_transformers as ppb\n",
    "from extract_features import *"
   ]
  },
@@ -375,8 +375,8 @@
     "name": "stderr",
     "output_type": "stream",
     "text": [
-      "11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\n",
-      "11/15/2018 16:21:18 - INFO - pytorch_pretrained_bert.modeling -   Model config {\n",
+      "11/15/2018 16:21:18 - INFO - pytorch_transformers.modeling_bert -   loading archive file ../../google_models/uncased_L-12_H-768_A-12/\n",
+      "11/15/2018 16:21:18 - INFO - pytorch_transformers.modeling_bert -   Model config {\n",
      "  \"attention_probs_dropout_prob\": 0.1,\n",
      "  \"hidden_act\": \"gelu\",\n",
      "  \"hidden_dropout_prob\": 0.1,\n",
--- a/pytorch_pretrained_bert/__init__.py
+++ b/pytorch_pretrained_bert/__init__.py
@@ -1,24 +0,0 @@
-__version__ = "0.6.2"
-from .tokenization import BertTokenizer, BasicTokenizer, WordpieceTokenizer
-from .tokenization_openai import OpenAIGPTTokenizer
-from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
-from .tokenization_gpt2 import GPT2Tokenizer
-
-from .modeling import (BertConfig, BertModel, BertForPreTraining,
-                       BertForMaskedLM, BertForNextSentencePrediction,
-                       BertForSequenceClassification, BertForMultipleChoice,
-                       BertForTokenClassification, BertForQuestionAnswering,
-                       load_tf_weights_in_bert)
-from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
-                              OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
-                              load_tf_weights_in_openai_gpt)
-from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
-                                  load_tf_weights_in_transfo_xl)
-from .modeling_gpt2 import (GPT2Config, GPT2Model,
-                            GPT2LMHeadModel, GPT2DoubleHeadsModel, GPT2MultipleChoiceHead,
-                            load_tf_weights_in_gpt2)
-
-from .optimization import BertAdam
-from .optimization_openai import OpenAIAdam
-
-from .file_utils import PYTORCH_PRETRAINED_BERT_CACHE, cached_path, WEIGHTS_NAME, CONFIG_NAME
--- a/pytorch_pretrained_bert/__main__.py
+++ b/pytorch_pretrained_bert/__main__.py
@@ -1,83 +0,0 @@
-# coding: utf8
-def main():
-    import sys
-    if (len(sys.argv) != 4 and len(sys.argv) != 5) or sys.argv[1] not in [
-        "convert_tf_checkpoint_to_pytorch",
-        "convert_openai_checkpoint",
-        "convert_transfo_xl_checkpoint",
-        "convert_gpt2_checkpoint",
-    ]:
-        print(
-        "Should be used as one of: \n"
-        ">> `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`, \n"
-        ">> `pytorch_pretrained_bert convert_openai_checkpoint OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`, \n"
-        ">> `pytorch_pretrained_bert convert_transfo_xl_checkpoint TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG]` or \n"
-        ">> `pytorch_pretrained_bert convert_gpt2_checkpoint TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG]`")
-    else:
-        if sys.argv[1] == "convert_tf_checkpoint_to_pytorch":
-            try:
-                from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
-            except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions.")
-                raise
-
-            if len(sys.argv) != 5:
-                # pylint: disable=line-too-long
-                print("Should be used as `pytorch_pretrained_bert convert_tf_checkpoint_to_pytorch TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
-            else:
-                PYTORCH_DUMP_OUTPUT = sys.argv.pop()
-                TF_CONFIG = sys.argv.pop()
-                TF_CHECKPOINT = sys.argv.pop()
-                convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
-        elif sys.argv[1] == "convert_openai_checkpoint":
-            from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
-            OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
-            PYTORCH_DUMP_OUTPUT = sys.argv[3]
-            if len(sys.argv) == 5:
-                OPENAI_GPT_CONFIG = sys.argv[4]
-            else:
-                OPENAI_GPT_CONFIG = ""
-            convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
-                                                 OPENAI_GPT_CONFIG,
-                                                 PYTORCH_DUMP_OUTPUT)
-        elif sys.argv[1] == "convert_transfo_xl_checkpoint":
-            try:
-                from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
-            except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions.")
-                raise
-
-            if 'ckpt' in sys.argv[2].lower():
-                TF_CHECKPOINT = sys.argv[2]
-                TF_DATASET_FILE = ""
-            else:
-                TF_DATASET_FILE = sys.argv[2]
-                TF_CHECKPOINT = ""
-            PYTORCH_DUMP_OUTPUT = sys.argv[3]
-            if len(sys.argv) == 5:
-                TF_CONFIG = sys.argv[4]
-            else:
-                TF_CONFIG = ""
-            convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
-        else:
-            try:
-                from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
-            except ImportError:
-                print("pytorch_pretrained_bert can only be used from the commandline to convert TensorFlow models in PyTorch, "
-                    "In that case, it requires TensorFlow to be installed. Please see "
-                    "https://www.tensorflow.org/install/ for installation instructions.")
-                raise
-
-            TF_CHECKPOINT = sys.argv[2]
-            PYTORCH_DUMP_OUTPUT = sys.argv[3]
-            if len(sys.argv) == 5:
-                TF_CONFIG = sys.argv[4]
-            else:
-                TF_CONFIG = ""
-            convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
-if __name__ == '__main__':
-    main()
--- a/pytorch_pretrained_bert/modeling.py
+++ b/pytorch_pretrained_bert/modeling.py
--- a/pytorch_pretrained_bert/modeling_gpt2.py
+++ b/pytorch_pretrained_bert/modeling_gpt2.py
@@ -1,949 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch OpenAI GPT-2 model."""
-
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import collections
-import copy
-import json
-import logging
-import math
-import os
-import sys
-from io import open
-
-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss
-from torch.nn.parameter import Parameter
-
-from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
-from .modeling import BertLayerNorm as LayerNorm
-
-logger = logging.getLogger(__name__)
-
-PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
-                                "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"}
-PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
-                                 "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"}
-
-def prune_conv1d_layer(layer, index, dim=1):
-    """ Prune a Conv1D layer (a model parameters) to keep only entries in index.
-        A Conv1D work as a Linear layer (see e.g. BERT) but the weights are transposed.
-        Return the pruned layer as a new layer with requires_grad=True.
-        Used to remove heads.
-    """
-    index = index.to(layer.weight.device)
-    W = layer.weight.index_select(dim, index).clone().detach()
-    if dim == 0:
-        b = layer.bias.clone().detach()
-    else:
-        b = layer.bias[index].clone().detach()
-    new_size = list(layer.weight.size())
-    new_size[dim] = len(index)
-    new_layer = Conv1D(new_size[1], new_size[0]).to(layer.weight.device)
-    new_layer.weight.requires_grad = False
-    new_layer.weight.copy_(W.contiguous())
-    new_layer.weight.requires_grad = True
-    new_layer.bias.requires_grad = False
-    new_layer.bias.copy_(b.contiguous())
-    new_layer.bias.requires_grad = True
-    return new_layer
-
-
-def load_tf_weights_in_gpt2(model, gpt2_checkpoint_path):
-    """ Load tf checkpoints in a pytorch model
-    """
-    try:
-        import re
-        import numpy as np
-        import tensorflow as tf
-    except ImportError:
-        print("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
-            "https://www.tensorflow.org/install/ for installation instructions.")
-        raise
-    tf_path = os.path.abspath(gpt2_checkpoint_path)
-    print("Converting TensorFlow checkpoint from {}".format(tf_path))
-    # Load weights from TF model
-    init_vars = tf.train.list_variables(tf_path)
-    names = []
-    arrays = []
-    for name, shape in init_vars:
-        print("Loading TF weight {} with shape {}".format(name, shape))
-        array = tf.train.load_variable(tf_path, name)
-        names.append(name)
-        arrays.append(array.squeeze())
-
-    for name, array in zip(names, arrays):
-        name = name[6:]  # skip "model/"
-        name = name.split('/')
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+\d+', m_name):
-                l = re.split(r'(\d+)', m_name)
-            else:
-                l = [m_name]
-            if l[0] == 'w' or l[0] == 'g':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'b':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'wpe' or l[0] == 'wte':
-                pointer = getattr(pointer, l[0])
-                pointer = getattr(pointer, 'weight')
-            else:
-                pointer = getattr(pointer, l[0])
-            if len(l) >= 2:
-                num = int(l[1])
-                pointer = pointer[num]
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        print("Initialize PyTorch weight {}".format(name))
-        pointer.data = torch.from_numpy(array)
-    return model
-
-
-def gelu(x):
-    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-
-
-class GPT2Config(object):
-    """Configuration class to store the configuration of a `GPT2Model`.
-
-    Every hyper-parameter is kept as a plain instance attribute, which is what
-    `to_dict` / `to_json_string` serialize and what `from_dict` restores.
-    """
-
-    def __init__(
-        self,
-        vocab_size_or_config_json_file=50257,
-        n_special=0,
-        n_positions=1024,
-        n_ctx=1024,
-        n_embd=768,
-        n_layer=12,
-        n_head=12,
-        resid_pdrop=0.1,
-        embd_pdrop=0.1,
-        attn_pdrop=0.1,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        predict_special_tokens=True
-    ):
-        """Constructs GPT2Config.
-
-        Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
-                When a path is given, the attributes are taken from the JSON file only and the
-                remaining keyword arguments are ignored.
-            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
-            n_positions: Number of positional embeddings.
-            n_ctx: Size of the causal mask (usually same as n_positions).
-            n_embd: Dimensionality of the embeddings and hidden states.
-            n_layer: Number of hidden layers in the Transformer encoder.
-            n_head: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            layer_norm_epsilon: epsilon to use in the layer norm layers
-            resid_pdrop: The dropout probability for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attn_pdrop: The dropout ratio for the attention
-                probabilities.
-            embd_pdrop: The dropout ratio for the embeddings.
-            initializer_range: The stddev of the truncated_normal_initializer for
-                initializing all weight matrices.
-            predict_special_tokens: should we predict special tokens (when the model has a LM head)
-
-        Raises:
-            ValueError: if the first argument is neither an int nor a string path.
-        """
-        # `unicode` only exists on Python 2; the version check short-circuits before it is evaluated on Python 3.
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.n_special = n_special
-            self.n_ctx = n_ctx
-            self.n_positions = n_positions
-            self.n_embd = n_embd
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.resid_pdrop = resid_pdrop
-            self.embd_pdrop = embd_pdrop
-            self.attn_pdrop = attn_pdrop
-            self.layer_norm_epsilon = layer_norm_epsilon
-            self.initializer_range = initializer_range
-            self.predict_special_tokens = predict_special_tokens
-        else:
-            # The trailing space keeps the two concatenated literals from running together.
-            raise ValueError(
-                "First argument must be either a vocabulary size (int) "
-                "or the path to a pretrained model config file (str)"
-            )
-
-    @property
-    def total_tokens_embeddings(self):
-        """Size of the token embedding matrix: regular vocabulary plus special tokens."""
-        return self.vocab_size + self.n_special
-
-    @classmethod
-    def from_dict(cls, json_object):
-        """Constructs a `GPT2Config` from a Python dictionary of parameters."""
-        # Build through `cls` so that subclasses get an instance of their own type.
-        # The -1 vocabulary size is a placeholder that selects the int branch of `__init__`
-        # (populating the defaults) before the dictionary values overwrite them.
-        config = cls(vocab_size_or_config_json_file=-1)
-        for key, value in json_object.items():
-            config.__dict__[key] = value
-        return config
-
-    @classmethod
-    def from_json_file(cls, json_file):
-        """Constructs a `GPT2Config` from a json file of parameters."""
-        with open(json_file, "r", encoding="utf-8") as reader:
-            text = reader.read()
-        return cls.from_dict(json.loads(text))
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        # Deep copy so callers can mutate the result without touching the config.
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-    def to_json_file(self, json_file_path):
-        """ Save this instance to a json file."""
-        with open(json_file_path, "w", encoding='utf-8') as writer:
-            writer.write(self.to_json_string())
-
-
-class Conv1D(nn.Module):
-    """Affine projection of the last input dimension from `nx` to `nf` features.
-
-    The weight is stored with shape (nx, nf) and the bias with shape (nf,).
-    """
-
-    def __init__(self, nf, nx):
-        super(Conv1D, self).__init__()
-        self.nf = nf
-        initial_weight = torch.empty(nx, nf)
-        nn.init.normal_(initial_weight, std=0.02)
-        self.weight = Parameter(initial_weight)
-        self.bias = Parameter(torch.zeros(nf))
-
-    def forward(self, x):
-        # Collapse every leading dimension so a single matrix product handles the whole input.
-        flat_input = x.view(-1, x.size(-1))
-        projected = torch.addmm(self.bias, flat_input, self.weight)
-        # Restore the leading dimensions, with the last one now holding `nf` features.
-        output_shape = x.size()[:-1] + (self.nf,)
-        return projected.view(*output_shape)
-
-
-class Attention(nn.Module):
-    """Causal multi-head self-attention with optional key/value caching and head masking.
-
-    Args:
-        nx: input feature size (the caller in this file passes `config.n_embd`).
-        n_ctx: side length of the lower-triangular causal mask buffer.
-        config: object providing `n_head`, `attn_pdrop` and `resid_pdrop`.
-        scale: if True, attention scores are divided by sqrt(head feature size).
-        output_attentions: if True, `forward` also returns the attention weights.
-        keep_multihead_output: if True, `forward` stores the per-head context tensor in
-            `self.multihead_output` and asks autograd to retain its gradient.
-    """
-
-    def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
-        super(Attention, self).__init__()
-        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
-        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
-        assert n_state % config.n_head == 0
-        # Lower-triangular matrix of ones, used in `_attn` as the causal mask.
-        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
-        self.n_head = config.n_head
-        self.split_size = n_state
-        self.scale = scale
-
-        self.output_attentions = output_attentions
-        self.keep_multihead_output = keep_multihead_output
-        self.multihead_output = None
-
-        # c_attn produces query, key and value in one projection (hence the factor 3).
-        self.c_attn = Conv1D(n_state * 3, nx)
-        self.c_proj = Conv1D(n_state, nx)
-        self.attn_dropout = nn.Dropout(config.attn_pdrop)
-        self.resid_dropout = nn.Dropout(config.resid_pdrop)
-
-    def prune_heads(self, heads):
-        """Remove the attention heads whose indices are listed in `heads` (no-op when empty)."""
-        if len(heads) == 0:
-            return
-        # Boolean mask over the flattened (head, head_feature) axis: False for pruned heads.
-        mask = torch.ones(self.n_head, self.split_size // self.n_head)
-        for head in heads:
-            mask[head] = 0
-        mask = mask.view(-1).contiguous().eq(1)
-        index = torch.arange(len(mask))[mask].long()
-        # The same kept columns are selected in each of the query / key / value thirds of c_attn.
-        index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
-        # Prune conv1d layers
-        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
-        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
-        # Update hyper params
-        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
-        self.n_head = self.n_head - len(heads)
-
-    def _attn(self, q, k, v, head_mask=None):
-        """Compute masked attention.
-
-        Returns the context tensor `softmax(q @ k) @ v`, preceded by the attention
-        weights when `self.output_attentions` is set.
-        """
-        w = torch.matmul(q, k)
-        if self.scale:
-            w = w / math.sqrt(v.size(-1))
-        nd, ns = w.size(-2), w.size(-1)
-        # Slice the causal mask to the current query (nd) and key (ns) lengths.
-        b = self.bias[:, :, ns-nd:ns, :ns]
-        # Masked-out positions receive a large negative score so softmax sends them to ~0.
-        w = w * b - 1e4 * (1 - b)
-
-        w = nn.Softmax(dim=-1)(w)
-        w = self.attn_dropout(w)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            w = w * head_mask
-
-        if self.output_attentions:
-            return w, torch.matmul(w, v)
-        return torch.matmul(w, v)
-
-    def merge_heads(self, x):
-        """Inverse of `split_heads` for queries/values: fold the head axis back into the feature axis."""
-        x = x.permute(0, 2, 1, 3).contiguous()
-        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
-        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
-
-    def split_heads(self, x, k=False):
-        """Split the last dimension into (n_head, head_features); keys (`k=True`) come out pre-transposed."""
-        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
-        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
-        if k:
-            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)
-        else:
-            return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
-
-    def forward(self, x, layer_past=None, head_mask=None):
-        """Run self-attention over `x`.
-
-        Args:
-            x: input hidden states; the last dimension is projected by `c_attn`.
-            layer_past: optional cached tensor from a previous call (the `present` return value).
-            head_mask: optional multiplicative mask applied to the attention weights.
-
-        Returns:
-            `(a, present)`, or `(attentions, a, present)` when `output_attentions` is set,
-            where `present` stacks the (transposed) keys and the values for reuse as `layer_past`.
-        """
-        x = self.c_attn(x)
-        query, key, value = x.split(self.split_size, dim=2)
-        query = self.split_heads(query)
-        key = self.split_heads(key, k=True)
-        value = self.split_heads(value)
-        if layer_past is not None:
-            past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1]  # transpose back cf below
-            key = torch.cat((past_key, key), dim=-1)
-            value = torch.cat((past_value, value), dim=-2)
-        present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
-
-        attn_outputs = self._attn(query, key, value, head_mask)
-        # Unpack before touching `multihead_output`: with output_attentions enabled `_attn`
-        # returns a tuple, and calling retain_grad() on a tuple would raise AttributeError.
-        if self.output_attentions:
-            attentions, a = attn_outputs
-        else:
-            a = attn_outputs
-        if self.keep_multihead_output:
-            self.multihead_output = a
-            self.multihead_output.retain_grad()
-
-        a = self.merge_heads(a)
-        a = self.c_proj(a)
-        a = self.resid_dropout(a)
-        if self.output_attentions:
-            return attentions, a, present
-        return a, present
-
-
-class MLP(nn.Module):
-    """Position-wise feed-forward block: expand to `n_state` features, apply GELU, project back, dropout."""
-
-    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super(MLP, self).__init__()
-        embed_dim = config.n_embd
-        self.c_fc = Conv1D(n_state, embed_dim)
-        self.c_proj = Conv1D(embed_dim, n_state)
-        self.act = gelu
-        self.dropout = nn.Dropout(config.resid_pdrop)
-
-    def forward(self, x):
-        expanded = self.c_fc(x)
-        activated = self.act(expanded)
-        projected = self.c_proj(activated)
-        return self.dropout(projected)
-
-
-class Block(nn.Module):
-    """One transformer layer: layer-normed self-attention and layer-normed MLP, each with a residual add."""
-
-    def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
-        super(Block, self).__init__()
-        embed_dim = config.n_embd
-        self.output_attentions = output_attentions
-        self.ln_1 = LayerNorm(embed_dim, eps=config.layer_norm_epsilon)
-        self.attn = Attention(embed_dim, n_ctx, config, scale, output_attentions, keep_multihead_output)
-        self.ln_2 = LayerNorm(embed_dim, eps=config.layer_norm_epsilon)
-        self.mlp = MLP(4 * embed_dim, config)
-
-    def forward(self, x, layer_past=None, head_mask=None):
-        """Return `(x, present)`, preceded by the attention weights when `output_attentions` is set."""
-        normed_input = self.ln_1(x)
-        attn_results = self.attn(normed_input, layer_past=layer_past, head_mask=head_mask)
-        if self.output_attentions:
-            attentions, attn_out, present = attn_results
-        else:
-            attn_out, present = attn_results
-        # First residual connection (around attention), second one (around the MLP).
-        x = x + attn_out
-        mlp_out = self.mlp(self.ln_2(x))
-        x = x + mlp_out
-        if not self.output_attentions:
-            return x, present
-        return attentions, x, present
-
-
-class GPT2LMHead(nn.Module):
-    """ Language Model Head for the transformer
-
-    Projects hidden states onto the token embedding matrix, whose weights are shared
-    with the decoder, to produce language-modeling logits.
-    """
-
-    def __init__(self, model_embeddings_weights, config):
-        """
-        Args:
-            model_embeddings_weights: the embedding weight parameter to tie the decoder to;
-                its shape is read as (number of embeddings, embedding size).
-            config: object providing `n_embd`, `vocab_size` and `predict_special_tokens`.
-        """
-        super(GPT2LMHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.vocab_size = config.vocab_size
-        embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        # Forward the configured flag explicitly: relying on the default argument of
-        # `set_embeddings_weights` would silently reset it to True.
-        self.set_embeddings_weights(model_embeddings_weights,
-                                    predict_special_tokens=config.predict_special_tokens)
-
-    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
-        """Tie the decoder to `model_embeddings_weights` and update the special-token flag."""
-        self.predict_special_tokens = predict_special_tokens
-        self.decoder.weight = model_embeddings_weights  # Tied weights
-
-    def forward(self, hidden_state):
-        """Return logits over all embeddings, truncated to `vocab_size` when special tokens are not predicted."""
-        lm_logits = self.decoder(hidden_state)
-        if not self.predict_special_tokens:
-            lm_logits = lm_logits[..., :self.vocab_size]
-        return lm_logits
-
-
-class GPT2MultipleChoiceHead(nn.Module):
-    """ Classifier Head for the transformer: scores each choice from one selected token's hidden state """
-
-    def __init__(self, config):
-        super(GPT2MultipleChoiceHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.dropout = nn.Dropout2d(config.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
-        self.linear = nn.Linear(config.n_embd, 1)
-
-        nn.init.normal_(self.linear.weight, std=0.02)
-        # NOTE(review): the positional 0 is the mean, so the bias is drawn with the default std - confirm intent.
-        nn.init.normal_(self.linear.bias, 0)
-
-    def forward(self, hidden_states, mc_token_ids):
-        """Return one logit per choice.
-
-        hidden_states: (bsz, num_choices, seq_length, hidden_size)
-        mc_token_ids:  (bsz, num_choices), position of the token to classify from
-        returns:       (bsz, num_choices)
-        """
-        hidden_size = hidden_states.size(-1)
-        # Broadcast the token positions over the feature axis: (bsz, num_choices, 1, hidden_size)
-        gather_index = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_size)
-        # Pick the hidden state at the requested position: (bsz, num_choices, hidden_size)
-        selected = hidden_states.gather(2, gather_index).squeeze(2)
-        # Dropout2d acts on dim 1, so move the feature axis there and back again.
-        selected = self.dropout(selected.transpose(1, 2)).transpose(1, 2)
-        return self.linear(selected).squeeze(-1)
-
-
-class GPT2PreTrainedModel(nn.Module):
-    """ An abstract class to handle weights initialization and
-        a simple interface for downloading and loading pretrained models.
-    """
-
-    def __init__(self, config, *inputs, **kwargs):
-        """Store `config` on the module; raises ValueError if it is not a `GPT2Config`."""
-        super(GPT2PreTrainedModel, self).__init__()
-        if not isinstance(config, GPT2Config):
-            raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `GPT2Config`. "
-                "To create a model from a pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__
-                )
-            )
-        self.config = config
-
-    def init_weights(self, module):
-        """ Initialize the weights.
-
-        Meant to be passed to `nn.Module.apply`, which calls it once per sub-module.
-        """
-        if isinstance(module, (nn.Linear, nn.Embedding)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-        elif isinstance(module, LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a GPT2PreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `gpt2`
-                - a path or url to a pretrained model archive containing:
-                    . `gpt2_config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a GPT2Model instance
-                - a path or url to a pretrained model archive containing:
-                    . `gpt2_config.json` a configuration file for the model
-                    . a TensorFlow checkpoint with trained weights
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models.
-                It is copied before use and is never modified.
-            num_special_tokens: optional number of special token embeddings; defaults to `config.n_special`.
-            *inputs, **kwargs: additional input for the specific GPT2 class
-
-        Returns:
-            The loaded model, or `None` (after logging an error) when the weights or
-            configuration file cannot be resolved.
-
-        Raises:
-            RuntimeError: if loading the state dict reports errors.
-        """
-        # Remove the loader-specific options so that only model arguments reach `cls(...)`.
-        state_dict = kwargs.pop('state_dict', None)
-        cache_dir = kwargs.pop('cache_dir', None)
-        from_tf = kwargs.pop('from_tf', False)
-        num_special_tokens = kwargs.pop('num_special_tokens', None)
-
-        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
-            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
-        else:
-            archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        # redirect to the cache, if necessary
-        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained weights.".format(
-                        archive_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find file {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
-                        archive_file
-                    )
-                )
-            return None
-        try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-                        config_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find file {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
-                        config_file
-                    )
-                )
-            return None
-        if resolved_archive_file == archive_file and resolved_config_file == config_file:
-            logger.info("loading weights file {}".format(archive_file))
-            logger.info("loading configuration file {}".format(config_file))
-        else:
-            logger.info("loading weights file {} from cache at {}".format(
-                archive_file, resolved_archive_file))
-            logger.info("loading configuration file {} from cache at {}".format(
-                config_file, resolved_config_file))
-        # Load config
-        config = GPT2Config.from_json_file(resolved_config_file)
-        logger.info("Model config {}".format(config))
-        # Instantiate model.
-        model = cls(config, *inputs, **kwargs)
-        if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu')
-        if from_tf:
-            # Directly load from a TensorFlow checkpoint (stored as NumPy array)
-            return load_tf_weights_in_gpt2(model, resolved_archive_file)
-
-        # Copy state_dict before any modification: both the key renaming below and
-        # _load_from_state_dict may alter it, and a caller-supplied dict must stay untouched.
-        metadata = getattr(state_dict, "_metadata", None)
-        state_dict = state_dict.copy()
-        if metadata is not None:
-            state_dict._metadata = metadata
-
-        # Map the short parameter suffixes (.g / .b / .w) onto the .weight / .bias names used by the modules.
-        old_keys = []
-        new_keys = []
-        for key in state_dict.keys():
-            new_key = None
-            if key.endswith(".g"):
-                new_key = key[:-2] + ".weight"
-            elif key.endswith(".b"):
-                new_key = key[:-2] + ".bias"
-            elif key.endswith(".w"):
-                new_key = key[:-2] + ".weight"
-            if new_key:
-                old_keys.append(key)
-                new_keys.append(new_key)
-        for old_key, new_key in zip(old_keys, new_keys):
-            state_dict[new_key] = state_dict.pop(old_key)
-
-        missing_keys = []
-        unexpected_keys = []
-        error_msgs = []
-
-        def load(module, prefix=""):
-            # Recursively load each sub-module, extending the key prefix with the child's name.
-            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-            module._load_from_state_dict(
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
-            )
-            for name, child in module._modules.items():
-                if child is not None:
-                    load(child, prefix + name + ".")
-
-        # If no checkpoint key carries the 'transformer.' prefix, the checkpoint holds the bare
-        # transformer, so load it into the `transformer` sub-module rather than the wrapper.
-        start_model = model
-        if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()):
-            start_model = model.transformer
-        load(start_model, prefix="")
-
-        if len(missing_keys) > 0:
-            logger.info(
-                "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
-            )
-        if len(unexpected_keys) > 0:
-            logger.info(
-                "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
-            )
-        if len(error_msgs) > 0:
-            raise RuntimeError(
-                "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
-            )
-
-        # Add additional embeddings for special tokens if needed
-        # This step also make sure we are still sharing the output and input embeddings after loading weights
-        model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special)
-        return model
-
-
-class GPT2Model(GPT2PreTrainedModel):
-    """OpenAI GPT-2 model ("Language Models are Unsupervised Multitask Learners").
-
-    GPT-2 use a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controlled using the `set_num_special_tokens(num_special_tokens)` function.
-
-    The embeddings are ordered as follow in the token embeddings matrix:
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
-
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
-
-    Params:
-        `config`: a GPT2Config class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
-
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `past`: an optional list of tensors that contains pre-computed hidden-states
-            (key and values in the attention blocks) to speed up sequential decoding
-            (this is the presents output of the model, cf. below).
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with values between 0 and 1.
-            It is multiplied with the attention probabilities: 1.0 => head is kept, 0.0 => head is nullified.
-
-    Outputs a tuple consisting of:
-        `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
-            as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
-            (or more generally [d_1, ..., d_n, hidden_size] where d_1 ... d_n are the dimension of input_ids)
-        `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
-            torch.FloatTensors. They can be reused to speed up sequential decoding.
-    When `output_attentions` is True, the list of per-layer attention weights is prepended to this tuple.
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-    config = modeling_gpt2.GPT2Config()
-
-    model = modeling_gpt2.GPT2Model(config)
-    hidden_states, presents = model(input_ids)
-    ```
-    """
-
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
-        super(GPT2Model, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.wte = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
-        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
-        self.drop = nn.Dropout(config.embd_pdrop)
-        # Build one block and deep-copy it so every layer owns independent parameters.
-        block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions,
-                                                        keep_multihead_output=keep_multihead_output)
-        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
-        self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
-
-        self.apply(self.init_weights)
-
-    def set_num_special_tokens(self, num_special_tokens):
-        " Update input embeddings with new embedding matrix if needed "
-        if self.config.n_special == num_special_tokens:
-            return
-        # Update config
-        self.config.n_special = num_special_tokens
-        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
-        old_embed = self.wte
-        self.wte = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        self.wte.to(old_embed.weight.device)
-        self.init_weights(self.wte)
-        # Copy word embeddings from the previous weights
-        self.wte.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
-
-    def prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
-        for layer, heads in heads_to_prune.items():
-            self.h[layer].attn.prune_heads(heads)
-
-    def get_multihead_outputs(self):
-        """ Gather all multi-head outputs.
-            Return: list (layers) of multihead module outputs with gradients
-        """
-        return [h.attn.multihead_output for h in self.h]
-
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
-        """Run the transformer; see the class docstring for the inputs and outputs."""
-        if past is None:
-            past_length = 0
-            past = [None] * len(self.h)
-        else:
-            # Number of positions already cached, read from the first layer's stored keys.
-            past_length = past[0][0].size(-2)
-        if position_ids is None:
-            # Default positions continue after the cached ones.
-            position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
-            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                # `expand` takes the target sizes; `expand_as` expects a tensor and fails on ints.
-                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.n_layer
-
-        # Flatten any extra leading dimensions; they are restored through `output_shape` below.
-        input_shape = input_ids.size()
-        input_ids = input_ids.view(-1, input_ids.size(-1))
-        position_ids = position_ids.view(-1, position_ids.size(-1))
-
-        inputs_embeds = self.wte(input_ids)
-        position_embeds = self.wpe(position_ids)
-        if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
-            # Token types are looked up in the same embedding matrix as the tokens.
-            token_type_embeds = self.wte(token_type_ids)
-        else:
-            token_type_embeds = 0
-        hidden_states = inputs_embeds + position_embeds + token_type_embeds
-        hidden_states = self.drop(hidden_states)
-
-        output_shape = input_shape + (hidden_states.size(-1),)
-
-        presents = []
-        all_attentions = []
-        all_hidden_states = []
-        for i, (block, layer_past) in enumerate(zip(self.h, past)):
-            # Record the input of each layer; the final output is appended after the loop.
-            all_hidden_states.append(hidden_states.view(*output_shape))
-            outputs = block(hidden_states, layer_past, head_mask[i])
-            if self.output_attentions:
-                attentions, hidden_states, present = outputs
-                all_attentions.append(attentions)
-            else:
-                hidden_states, present = outputs
-            presents.append(present)
-        hidden_states = self.ln_f(hidden_states)
-        all_hidden_states.append(hidden_states.view(*output_shape))
-
-        if self.output_attentions:
-            return all_attentions, all_hidden_states, presents
-        return all_hidden_states, presents
-
-
-class GPT2LMHeadModel(GPT2PreTrainedModel):
-    """OpenAI GPT-2 model with a Language Modeling head ("Language Models are Unsupervised Multitask Learners").
-
-    Wraps a `GPT2Model` and a `GPT2LMHead` whose decoder weights are tied to the
-    transformer's token embeddings.
-
-    Params:
-        `config`: a GPT2Config class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
-
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            where d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, config.vocab_size[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
-            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., vocab_size]
-        `past`: an optional list of tensors that contains pre-computed hidden-states
-            (key and values in the attention blocks) to speed up sequential decoding
-            (this is the presents output of the model, cf. below).
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with values between 0 and 1,
-            forwarded unchanged to the wrapped transformer.
-
-    Outputs:
-        if `lm_labels` is not `None`:
-            Outputs the language modeling loss.
-        else a tuple:
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, config.vocab_size]
-                (or more generally [d_1, ..., d_n, config.vocab_size] where d_1 ... d_n are the dimension of input_ids)
-            `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
-                torch.FloatTensors. They can be reused to speed up sequential decoding.
-            When `output_attentions` is True, the per-layer attention weights are prepended to this tuple.
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-    config = modeling_gpt2.GPT2Config()
-
-    model = modeling_gpt2.GPT2LMHeadModel(config)
-    lm_logits, presents = model(input_ids)
-    ```
-    """
-
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
-        super(GPT2LMHeadModel, self).__init__(config)
-        self.transformer = GPT2Model(
-            config,
-            output_attentions=output_attentions,
-            keep_multihead_output=keep_multihead_output,
-        )
-        # The head receives the embedding weight itself, so input and output embeddings are shared.
-        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
-        self.apply(self.init_weights)
-
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Resize the input embeddings for `num_special_tokens` special tokens and
-            re-tie the output embeddings to them so both stay shared
-        """
-        self.config.predict_special_tokens = predict_special_tokens
-        self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        # The transformer may have replaced its embedding module, so hand the current weight to the head.
-        self.lm_head.set_embeddings_weights(
-            self.transformer.wte.weight, predict_special_tokens=predict_special_tokens
-        )
-
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, past=None, head_mask=None):
-        """Return the LM loss when `lm_labels` is given, otherwise the logits and cached presents."""
-        transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
-        with_attentions = self.transformer.output_attentions
-        if with_attentions:
-            all_attentions, all_hidden_states, presents = transformer_output
-        else:
-            all_hidden_states, presents = transformer_output
-
-        # Only the last entry (output of the final layer norm) feeds the LM head.
-        lm_logits = self.lm_head(all_hidden_states[-1])
-
-        if lm_labels is None:
-            if with_attentions:
-                return all_attentions, lm_logits, presents
-            return lm_logits, presents
-
-        # Shift so that tokens < n predict n
-        shift_logits = lm_logits[..., :-1, :].contiguous()
-        shift_labels = lm_labels[..., 1:].contiguous()
-        # Flatten the tokens; labels equal to -1 are ignored by the loss
-        loss_fct = CrossEntropyLoss(ignore_index=-1)
-        flat_logits = shift_logits.view(-1, shift_logits.size(-1))
-        flat_labels = shift_labels.view(-1)
-        return loss_fct(flat_logits, flat_labels)
-
-
-class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
-    """OpenAI GPT-2 model with a Language Modeling and a Multiple Choice head ("Language Models are Unsupervised Multitask Learners").
-
-    Params:
-        `config`: a GPT2Config class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
-
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
-            indices selected in the range [0, config.vocab_size[
-        `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
-            which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with indices selected in [-1, 0, ..., config.vocab_size]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., config.vocab_size]
-        `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
-            with indices selected in [0, ..., num_choices].
-        `past`: an optional list of torch.LongTensor that contains pre-computed hidden-states
-            (key and values in the attention blocks) to speed up sequential decoding
-            (this is the presents output of the model, cf. below).
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-    Outputs:
-        if `lm_labels` and `multiple_choice_labels` are not `None`:
-            Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
-        else: a tuple with
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, config.vocab_size]
-            `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
-            `presents`: a list of pre-computed hidden-states (key and values in each attention blocks) as
-                torch.FloatTensors. They can be reused to speed up sequential decoding.
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
-    mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
-
-    config = modeling_gpt2.GPT2Config()
-
-    model = modeling_gpt2.GPT2DoubleHeadsModel(config)
-    lm_logits, multiple_choice_logits, presents = model(input_ids, mc_token_ids)
-    ```
-    """
-
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
-        super(GPT2DoubleHeadsModel, self).__init__(config)
-        self.transformer = GPT2Model(config, output_attentions=output_attentions,
-                                             keep_multihead_output=keep_multihead_output)
-        self.lm_head = GPT2LMHead(self.transformer.wte.weight, config)
-        self.multiple_choice_head = GPT2MultipleChoiceHead(config)
-        self.apply(self.init_weights)
-
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrice
-            Make sure we are sharing the embeddings
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.wte.weight, predict_special_tokens=predict_special_tokens)
-
-    def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None,
-                position_ids=None, past=None, head_mask=None):
-        transformer_output = self.transformer(input_ids, position_ids, token_type_ids, past, head_mask)
-        if self.transformer.output_attentions:
-            all_attentions, hidden_states, presents = transformer_output
-        else:
-            hidden_states, presents = transformer_output
-        hidden_states = hidden_states[-1]
-
-        lm_logits = self.lm_head(hidden_states)
-        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
-        losses = []
-        if lm_labels is not None:
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = lm_labels[..., 1:].contiguous()
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)))
-        if mc_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
-        if losses:
-            return losses
-        if self.transformer.output_attentions:
-            return all_attentions, lm_logits, mc_logits, presents
-        return lm_logits, mc_logits, presents
--- a/pytorch_pretrained_bert/modeling_openai.py
+++ b/pytorch_pretrained_bert/modeling_openai.py
@@ -1,961 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch OpenAI GPT model."""
-
-from __future__ import absolute_import, division, print_function, unicode_literals
-
-import collections
-import copy
-import json
-import logging
-import math
-import os
-import sys
-from io import open
-
-import torch
-import torch.nn as nn
-from torch.nn import CrossEntropyLoss
-from torch.nn.parameter import Parameter
-
-from .file_utils import cached_path, CONFIG_NAME, WEIGHTS_NAME
-from .modeling import BertLayerNorm as LayerNorm
-from .modeling_gpt2 import prune_conv1d_layer
-
-logger = logging.getLogger(__name__)
-
-PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
-PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
-
-
-def load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path):
-    """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
-    """
-    import re
-    import numpy as np
-    print("Loading weights...")
-    names = json.load(open(openai_checkpoint_folder_path + '/parameters_names.json', "r", encoding='utf-8'))
-    shapes = json.load(open(openai_checkpoint_folder_path + '/params_shapes.json', "r", encoding='utf-8'))
-    offsets = np.cumsum([np.prod(shape) for shape in shapes])
-    init_params = [np.load(openai_checkpoint_folder_path + '/params_{}.npy'.format(n)) for n in range(10)]
-    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
-    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
-
-    # This was used when we had a single embedding matrix for positions and tokens
-    # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
-    # del init_params[1]
-    init_params = [arr.squeeze() for arr in init_params]
-
-    try:
-        assert model.tokens_embed.weight.shape == init_params[1].shape
-        assert model.positions_embed.weight.shape == init_params[0].shape
-    except AssertionError as e:
-        e.args += (model.tokens_embed.weight.shape, init_params[1].shape)
-        e.args += (model.positions_embed.weight.shape, init_params[0].shape)
-        raise
-
-    model.tokens_embed.weight.data = torch.from_numpy(init_params[1])
-    model.positions_embed.weight.data = torch.from_numpy(init_params[0])
-    names.pop(0)
-    # Pop position and token embedding arrays
-    init_params.pop(0)
-    init_params.pop(0)
-
-    for name, array in zip(names, init_params): # names[1:n_transfer], init_params[1:n_transfer]):
-        name = name[6:]  # skip "model/"
-        assert name[-2:] == ":0"
-        name = name[:-2]
-        name = name.split('/')
-        pointer = model
-        for m_name in name:
-            if re.fullmatch(r'[A-Za-z]+\d+', m_name):
-                l = re.split(r'(\d+)', m_name)
-            else:
-                l = [m_name]
-            if l[0] == 'g':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'b':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'w':
-                pointer = getattr(pointer, 'weight')
-            else:
-                pointer = getattr(pointer, l[0])
-            if len(l) >= 2:
-                num = int(l[1])
-                pointer = pointer[num]
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        try:
-            assert pointer.shape == array.shape
-        except AssertionError as e:
-            e.args += (pointer.shape, array.shape)
-            raise
-        print("Initialize PyTorch weight {}".format(name))
-        pointer.data = torch.from_numpy(array)
-    return model
-
-
-def gelu(x):
-    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
-
-
-def swish(x):
-    return x * torch.sigmoid(x)
-
-
-ACT_FNS = {"relu": nn.ReLU, "swish": swish, "gelu": gelu}
-
-
-class OpenAIGPTConfig(object):
-    """Configuration class to store the configuration of a `OpenAIGPTModel`.
-    """
-
-    def __init__(
-        self,
-        vocab_size_or_config_json_file=40478,
-        n_special=0,
-        n_positions=512,
-        n_ctx=512,
-        n_embd=768,
-        n_layer=12,
-        n_head=12,
-        afn="gelu",
-        resid_pdrop=0.1,
-        embd_pdrop=0.1,
-        attn_pdrop=0.1,
-        layer_norm_epsilon=1e-5,
-        initializer_range=0.02,
-        predict_special_tokens=True
-    ):
-        """Constructs OpenAIGPTConfig.
-
-        Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
-            n_special: The number of special tokens to learn during fine-tuning ('[SEP]', '[CLF]', ...)
-            n_positions: Number of positional embeddings.
-            n_ctx: Size of the causal mask (usually same as n_positions).
-            n_embd: Dimensionality of the embeddings and hidden states.
-            n_layer: Number of hidden layers in the Transformer encoder.
-            n_head: Number of attention heads for each attention layer in
-                the Transformer encoder.
-            afn: The non-linear activation function (function or string) in the
-                encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
-            resid_pdrop: The dropout probabilitiy for all fully connected
-                layers in the embeddings, encoder, and pooler.
-            attn_pdrop: The dropout ratio for the attention
-                probabilities.
-            embd_pdrop: The dropout ratio for the embeddings.
-            layer_norm_epsilon: epsilon to use in the layer norm layers
-            initializer_range: The sttdev of the truncated_normal_initializer for
-                initializing all weight matrices.
-            predict_special_tokens: should we predict special tokens (when the model has a LM head)
-        """
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.n_special = n_special
-            self.n_ctx = n_ctx
-            self.n_positions = n_positions
-            self.n_embd = n_embd
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.afn = afn
-            self.resid_pdrop = resid_pdrop
-            self.embd_pdrop = embd_pdrop
-            self.attn_pdrop = attn_pdrop
-            self.layer_norm_epsilon = layer_norm_epsilon
-            self.initializer_range = initializer_range
-            self.predict_special_tokens = predict_special_tokens
-        else:
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
-
-    @property
-    def total_tokens_embeddings(self):
-        return self.vocab_size + self.n_special
-
-    @classmethod
-    def from_dict(cls, json_object):
-        """Constructs a `OpenAIGPTConfig` from a Python dictionary of parameters."""
-        config = OpenAIGPTConfig(vocab_size_or_config_json_file=-1)
-        for key, value in json_object.items():
-            config.__dict__[key] = value
-        return config
-
-    @classmethod
-    def from_json_file(cls, json_file):
-        """Constructs a `OpenAIGPTConfig` from a json file of parameters."""
-        with open(json_file, "r", encoding="utf-8") as reader:
-            text = reader.read()
-        return cls.from_dict(json.loads(text))
-
-    def __repr__(self):
-        return str(self.to_json_string())
-
-    def to_dict(self):
-        """Serializes this instance to a Python dictionary."""
-        output = copy.deepcopy(self.__dict__)
-        return output
-
-    def to_json_string(self):
-        """Serializes this instance to a JSON string."""
-        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
-
-    def to_json_file(self, json_file_path):
-        """ Save this instance to a json file."""
-        with open(json_file_path, "w", encoding='utf-8') as writer:
-            writer.write(self.to_json_string())
-
-
-class Conv1D(nn.Module):
-    def __init__(self, nf, rf, nx):
-        super(Conv1D, self).__init__()
-        self.rf = rf
-        self.nf = nf
-        if rf == 1:  # faster 1x1 conv
-            w = torch.empty(nx, nf)
-            nn.init.normal_(w, std=0.02)
-            self.weight = Parameter(w)
-            self.bias = Parameter(torch.zeros(nf))
-        else:  # was used to train LM
-            raise NotImplementedError
-
-    def forward(self, x):
-        if self.rf == 1:
-            size_out = x.size()[:-1] + (self.nf,)
-            x = torch.addmm(self.bias, x.view(-1, x.size(-1)), self.weight)
-            x = x.view(*size_out)
-        else:
-            raise NotImplementedError
-        return x
-
-
-class Attention(nn.Module):
-    def __init__(self, nx, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
-        super(Attention, self).__init__()
-        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
-        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
-        assert n_state % config.n_head == 0
-        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
-        self.n_head = config.n_head
-        self.split_size = n_state
-        self.scale = scale
-
-        self.output_attentions = output_attentions
-        self.keep_multihead_output = keep_multihead_output
-        self.multihead_output = None
-
-        self.c_attn = Conv1D(n_state * 3, 1, nx)
-        self.c_proj = Conv1D(n_state, 1, nx)
-        self.attn_dropout = nn.Dropout(config.attn_pdrop)
-        self.resid_dropout = nn.Dropout(config.resid_pdrop)
-
-    def prune_heads(self, heads):
-        if len(heads) == 0:
-            return
-        mask = torch.ones(self.n_head, self.split_size // self.n_head)
-        for head in heads:
-            mask[head] = 0
-        mask = mask.view(-1).contiguous().eq(1)
-        index = torch.arange(len(mask))[mask].long()
-        index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
-        # Prune conv1d layers
-        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
-        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
-        # Update hyper params
-        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
-        self.n_head = self.n_head - len(heads)
-
-    def _attn(self, q, k, v, head_mask=None):
-        w = torch.matmul(q, k)
-        if self.scale:
-            w = w / math.sqrt(v.size(-1))
-        # w = w * self.bias + -1e9 * (1 - self.bias)  # TF implem method: mask_attn_weights
-        # XD: self.b may be larger than w, so we need to crop it
-        b = self.bias[:, :, : w.size(-2), : w.size(-1)]
-        w = w * b + -1e9 * (1 - b)
-
-        w = nn.Softmax(dim=-1)(w)
-        w = self.attn_dropout(w)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            w = w * head_mask
-
-        if self.output_attentions:
-            return w, torch.matmul(w, v)
-        return torch.matmul(w, v)
-
-    def merge_heads(self, x):
-        x = x.permute(0, 2, 1, 3).contiguous()
-        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
-        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
-
-    def split_heads(self, x, k=False):
-        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
-        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
-        if k:
-            return x.permute(0, 2, 3, 1)
-        else:
-            return x.permute(0, 2, 1, 3)
-
-    def forward(self, x, head_mask=None):
-        x = self.c_attn(x)
-        query, key, value = x.split(self.split_size, dim=2)
-        query = self.split_heads(query)
-        key = self.split_heads(key, k=True)
-        value = self.split_heads(value)
-
-        a = self._attn(query, key, value, head_mask)
-        if self.keep_multihead_output:
-            self.multihead_output = a
-            self.multihead_output.retain_grad()
-
-        if self.output_attentions:
-            attentions, a = a
-        a = self.merge_heads(a)
-        a = self.c_proj(a)
-        a = self.resid_dropout(a)
-        if self.output_attentions:
-            return attentions, a
-        return a
-
-
-class MLP(nn.Module):
-    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
-        super(MLP, self).__init__()
-        nx = config.n_embd
-        self.c_fc = Conv1D(n_state, 1, nx)
-        self.c_proj = Conv1D(nx, 1, n_state)
-        self.act = ACT_FNS[config.afn]
-        self.dropout = nn.Dropout(config.resid_pdrop)
-
-    def forward(self, x):
-        h = self.act(self.c_fc(x))
-        h2 = self.c_proj(h)
-        return self.dropout(h2)
-
-
-class Block(nn.Module):
-    def __init__(self, n_ctx, config, scale=False, output_attentions=False, keep_multihead_output=False):
-        super(Block, self).__init__()
-        nx = config.n_embd
-        self.output_attentions = output_attentions
-        self.attn = Attention(nx, n_ctx, config, scale, output_attentions, keep_multihead_output)
-        self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
-        self.mlp = MLP(4 * nx, config)
-        self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
-
-    def forward(self, x, head_mask=None):
-        a = self.attn(x, head_mask=head_mask)
-        if self.output_attentions:
-            attentions, a = a
-        n = self.ln_1(x + a)
-        m = self.mlp(n)
-        h = self.ln_2(n + m)
-        if self.output_attentions:
-            return attentions, h
-        return h
-
-
-class OpenAIGPTLMHead(nn.Module):
-    """ Language Model Head for the transformer """
-
-    def __init__(self, model_embeddings_weights, config):
-        super(OpenAIGPTLMHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.vocab_size = config.vocab_size
-        self.predict_special_tokens = config.predict_special_tokens
-        embed_shape = model_embeddings_weights.shape
-        self.decoder = nn.Linear(embed_shape[1], embed_shape[0], bias=False)
-        self.set_embeddings_weights(model_embeddings_weights)
-
-    def set_embeddings_weights(self, model_embeddings_weights, predict_special_tokens=True):
-        self.predict_special_tokens = predict_special_tokens
-        embed_shape = model_embeddings_weights.shape
-        self.decoder.weight = model_embeddings_weights  # Tied weights
-
-    def forward(self, hidden_state):
-        lm_logits = self.decoder(hidden_state)
-        if not self.predict_special_tokens:
-            lm_logits = lm_logits[..., :self.vocab_size]
-        return lm_logits
-
-
-class OpenAIGPTMultipleChoiceHead(nn.Module):
-    """ Classifier Head for the transformer """
-
-    def __init__(self, config):
-        super(OpenAIGPTMultipleChoiceHead, self).__init__()
-        self.n_embd = config.n_embd
-        self.dropout = nn.Dropout2d(config.resid_pdrop)  # To reproduce the noise_shape parameter of TF implementation
-        self.linear = nn.Linear(config.n_embd, 1)
-
-        nn.init.normal_(self.linear.weight, std=0.02)
-        nn.init.normal_(self.linear.bias, 0)
-
-    def forward(self, hidden_states, mc_token_ids):
-        # Classification logits
-        # hidden_state (bsz, num_choices, seq_length, hidden_size)
-        # mc_token_ids (bsz, num_choices)
-        mc_token_ids = mc_token_ids.unsqueeze(-1).unsqueeze(-1).expand(-1, -1, -1, hidden_states.size(-1))
-        # (bsz, num_choices, 1, hidden_size)
-        multiple_choice_h = hidden_states.gather(2, mc_token_ids).squeeze(2)
-        # (bsz, num_choices, hidden_size)
-        multiple_choice_h = self.dropout(multiple_choice_h.transpose(1, 2)).transpose(1, 2)
-        multiple_choice_logits = self.linear(multiple_choice_h).squeeze(-1)
-        # (bsz, num_choices)
-        return multiple_choice_logits
-
-
-class OpenAIGPTPreTrainedModel(nn.Module):
-    """ An abstract class to handle weights initialization and
-        a simple interface for dowloading and loading pretrained models.
-    """
-
-    def __init__(self, config, *inputs, **kwargs):
-        super(OpenAIGPTPreTrainedModel, self).__init__()
-        if not isinstance(config, OpenAIGPTConfig):
-            raise ValueError(
-                "Parameter config in `{}(config)` should be an instance of class `OpenAIGPTConfig`. "
-                "To create a model from a pretrained model use "
-                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
-                    self.__class__.__name__, self.__class__.__name__
-                )
-            )
-        self.config = config
-
-    def init_weights(self, module):
-        """ Initialize the weights.
-        """
-        if isinstance(module, (nn.Linear, nn.Embedding)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-        elif isinstance(module, LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, num_special_tokens=None, *inputs, **kwargs):
-        """
-        Instantiate a OpenAIGPTPreTrainedModel from a pre-trained model file or a pytorch state dict.
-        Download and cache the pre-trained model file if needed.
-
-        Params:
-            pretrained_model_name_or_path: either:
-                - a str with the name of a pre-trained model to load selected in the list of:
-                    . `openai-gpt`
-                - a path or url to a pretrained model archive containing:
-                    . `openai_gpt_config.json` a configuration file for the model
-                    . `pytorch_model.bin` a PyTorch dump of a OpenAIGPTModel instance
-                - a path or url to a pretrained model archive containing:
-                    . `openai-gpt-config.json` a configuration file for the model
-                    . a series of NumPy files containing OpenAI TensorFlow trained weights
-            from_tf: should we load the weights from a locally saved TensorFlow checkpoint
-            cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-            state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
-            *inputs, **kwargs: additional input for the specific OpenAI-GPT class
-        """
-        state_dict = kwargs.get('state_dict', None)
-        kwargs.pop('state_dict', None)
-        cache_dir = kwargs.get('cache_dir', None)
-        kwargs.pop('cache_dir', None)
-        from_tf = kwargs.get('from_tf', False)
-        kwargs.pop('from_tf', None)
-
-        if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-            archive_file = PRETRAINED_MODEL_ARCHIVE_MAP[pretrained_model_name_or_path]
-            config_file = PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
-        else:
-            archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
-            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        # redirect to the cache, if necessary
-        try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_MODEL_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained weights.".format(
-                        archive_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find file {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path, ", ".join(PRETRAINED_MODEL_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
-                        archive_file
-                    )
-                )
-            return None
-        try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_CONFIG_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
-                        config_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find file {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path, ", ".join(PRETRAINED_CONFIG_ARCHIVE_MAP.keys()), pretrained_model_name_or_path,
-                        config_file
-                    )
-                )
-            return None
-        if resolved_archive_file == archive_file and resolved_config_file == config_file:
-            logger.info("loading weights file {}".format(archive_file))
-            logger.info("loading configuration file {}".format(config_file))
-        else:
-            logger.info("loading weights file {} from cache at {}".format(
-                archive_file, resolved_archive_file))
-            logger.info("loading configuration file {} from cache at {}".format(
-                config_file, resolved_config_file))
-        # Load config
-        config = OpenAIGPTConfig.from_json_file(resolved_config_file)
-        logger.info("Model config {}".format(config))
-        # Instantiate model.
-        model = cls(config, *inputs, **kwargs)
-        if state_dict is None and not from_tf:
-            state_dict = torch.load(resolved_archive_file, map_location='cpu')
-        if from_tf:
-            # Directly load from a TensorFlow checkpoint (stored as NumPy array)
-            return load_tf_weights_in_openai_gpt(model, resolved_archive_file)
-
-        old_keys = []
-        new_keys = []
-        for key in state_dict.keys():
-            new_key = None
-            if key.endswith(".g"):
-                new_key = key[:-2] + ".weight"
-            elif key.endswith(".b"):
-                new_key = key[:-2] + ".bias"
-            elif key.endswith(".w"):
-                new_key = key[:-2] + ".weight"
-            if new_key:
-                old_keys.append(key)
-                new_keys.append(new_key)
-        for old_key, new_key in zip(old_keys, new_keys):
-            state_dict[new_key] = state_dict.pop(old_key)
-
-        missing_keys = []
-        unexpected_keys = []
-        error_msgs = []
-        # copy state_dict so _load_from_state_dict can modify it
-        metadata = getattr(state_dict, "_metadata", None)
-        state_dict = state_dict.copy()
-        if metadata is not None:
-            state_dict._metadata = metadata
-
-        def load(module, prefix=""):
-            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
-            module._load_from_state_dict(
-                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
-            )
-            for name, child in module._modules.items():
-                if child is not None:
-                    load(child, prefix + name + ".")
-
-        start_model = model
-        if hasattr(model, "transformer") and all(not s.startswith('transformer.') for s in state_dict.keys()):
-            start_model = model.transformer
-        load(start_model, prefix="")
-
-        if len(missing_keys) > 0:
-            logger.info(
-                "Weights of {} not initialized from pretrained model: {}".format(model.__class__.__name__, missing_keys)
-            )
-        if len(unexpected_keys) > 0:
-            logger.info(
-                "Weights from pretrained model not used in {}: {}".format(model.__class__.__name__, unexpected_keys)
-            )
-        if len(error_msgs) > 0:
-            raise RuntimeError(
-                "Error(s) in loading state_dict for {}:\n\t{}".format(model.__class__.__name__, "\n\t".join(error_msgs))
-            )
-
-        # Add additional embeddings for special tokens if needed
-        # This step also make sure we are still sharing the output and input embeddings after loading weights
-        model.set_num_special_tokens(num_special_tokens if num_special_tokens is not None else config.n_special)
-        return model
-
-
-class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
-    """OpenAI GPT model ("Improving Language Understanding by Generative Pre-Training").
-
-    OpenAI GPT use a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
-
-    The embeddings are ordered as follow in the token embeddings matrice:
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
-
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
-
-    Params:
-        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
-
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-    Outputs:
-        `hidden_states`: a list of all the encoded-hidden-states in the model (length of the list: number of layers + 1 for the output of the embeddings)
-            as torch.FloatTensor of size [batch_size, sequence_length, hidden_size]
-            (or more generally [d_1, ..., d_n, hidden_size] were d_1 ... d_n are the dimension of input_ids)
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-    config = modeling_openai.OpenAIGPTConfig()
-
-    model = modeling_openai.OpenAIGPTModel(config)
-    hidden_states = model(input_ids)
-    ```
-    """
-
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
-        super(OpenAIGPTModel, self).__init__(config)
-        self.output_attentions = output_attentions
-        self.tokens_embed = nn.Embedding(config.total_tokens_embeddings, config.n_embd)
-        self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
-        self.drop = nn.Dropout(config.embd_pdrop)
-        block = Block(config.n_ctx, config, scale=True, output_attentions=output_attentions,
-                                                        keep_multihead_output=keep_multihead_output)
-        self.h = nn.ModuleList([copy.deepcopy(block) for _ in range(config.n_layer)])
-
-        self.apply(self.init_weights)
-
-    def set_num_special_tokens(self, num_special_tokens):
-        " Update input embeddings with new embedding matrice if needed "
-        if self.config.n_special == num_special_tokens:
-            return
-        # Update config
-        self.config.n_special = num_special_tokens
-        # Build new embeddings and initialize all new embeddings (in particular the special tokens)
-        old_embed = self.tokens_embed
-        self.tokens_embed = nn.Embedding(self.config.total_tokens_embeddings, self.config.n_embd)
-        self.tokens_embed.to(old_embed.weight.device)
-        self.init_weights(self.tokens_embed)
-        # Copy word embeddings from the previous weights
-        self.tokens_embed.weight.data[:self.config.vocab_size, :] = old_embed.weight.data[:self.config.vocab_size, :]
-
-    def prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-        """
-        for layer, heads in heads_to_prune.items():
-            self.h[layer].attn.prune_heads(heads)
-
-    def get_multihead_outputs(self):
-        """ Gather all multi-head outputs.
-            Return: list (layers) of multihead module outputs with gradients
-        """
-        return [h.attn.multihead_output for h in self.h]
-
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
-        if position_ids is None:
-            # This was used when we had a single embedding matrice from position and token embeddings
-            # start = self.config.vocab_size + self.config.n_special
-            # end = start + input_ids.size(-1)
-            # position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
-            position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
-            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # head_mask has shape n_layer x batch x n_heads x N x N
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand_as(self.config.n_layer, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.n_layer
-
-        input_shape = input_ids.size()
-        input_ids = input_ids.view(-1, input_ids.size(-1))
-        position_ids = position_ids.view(-1, position_ids.size(-1))
-
-        inputs_embeds = self.tokens_embed(input_ids)
-        position_embeds = self.positions_embed(position_ids)
-        if token_type_ids is not None:
-            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
-            token_type_embeds = self.tokens_embed(token_type_ids)
-        else:
-            token_type_embeds = 0
-        hidden_states = inputs_embeds + position_embeds + token_type_embeds
-        hidden_states = self.drop(hidden_states)
-
-        output_shape = input_shape + (hidden_states.size(-1),)
-
-        all_attentions = []
-        all_hidden_states = [hidden_states.view(*output_shape)]
-        for i, block in enumerate(self.h):
-            outputs = block(hidden_states, head_mask[i])
-            if self.output_attentions:
-                attentions, hidden_states = outputs
-                all_attentions.append(attentions)
-            else:
-                hidden_states = outputs
-            all_hidden_states.append(hidden_states.view(*output_shape))
-
-        if self.output_attentions:
-            return all_attentions, all_hidden_states
-        return all_hidden_states
-
-
-class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
-    """OpenAI GPT model with a Language Modeling head ("Improving Language Understanding by Generative Pre-Training").
-
-    OpenAI GPT use a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
-
-    The embeddings are ordered as follow in the token embeddings matrice:
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
-
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
-
-    Params:
-        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
-
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, sequence_length] (or more generally [d_1, ..., d_n, sequence_length]
-            were d_1 ... d_n are arbitrary dimensions) with the word BPE token indices selected in the range [0, total_tokens_embeddings[
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, sequence_length]
-            with indices selected in [-1, 0, ..., vocab_size]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., vocab_size]
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-    Outputs:
-        if `lm_labels` is not `None`:
-            Outputs the language modeling loss.
-        else:
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, sequence_length, total_tokens_embeddings]
-                (or more generally [d_1, ..., d_n, total_tokens_embeddings] were d_1 ... d_n are the dimension of input_ids)
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[31, 51, 99], [15, 5, 0]])
-
-    config = modeling_openai.OpenAIGPTConfig()
-
-    model = modeling_openai.OpenAIGPTLMHeadModel(config)
-    lm_logits = model(input_ids)
-    ```
-    """
-
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
-        super(OpenAIGPTLMHeadModel, self).__init__(config)
-        self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions,
-                                             keep_multihead_output=keep_multihead_output)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
-        self.apply(self.init_weights)
-
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrice
-            Make sure we are sharing the embeddings
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
-
-    def forward(self, input_ids, position_ids=None, token_type_ids=None, lm_labels=None, head_mask=None):
-        hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
-        if self.transformer.output_attentions:
-            all_attentions, hidden_states = hidden_states
-        hidden_states = hidden_states[-1]
-
-        lm_logits = self.lm_head(hidden_states)
-        if lm_labels is not None:
-            # Shift so that tokens < n predict n
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = lm_labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
-                            shift_labels.view(-1))
-            return loss
-        if self.transformer.output_attentions:
-            return all_attentions, lm_logits
-        return lm_logits
-
-
-class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
-    """OpenAI GPT model with a Language Modeling and a Multiple Choice head ("Improving Language Understanding by Generative Pre-Training").
-
-    OpenAI GPT use a single embedding matrix to store the word and special embeddings.
-    Special tokens embeddings are additional tokens that are not pre-trained: [SEP], [CLS]...
-    Special tokens need to be trained during the fine-tuning if you use them.
-    The number of special embeddings can be controled using the `set_num_special_tokens(num_special_tokens)` function.
-
-    The embeddings are ordered as follow in the token embeddings matrice:
-        [0,                                                         ----------------------
-         ...                                                        -> word embeddings
-         config.vocab_size - 1,                                     ______________________
-         config.vocab_size,
-         ...                                                        -> special embeddings
-         config.vocab_size + config.n_special - 1]                  ______________________
-
-    where total_tokens_embeddings can be obtained as config.total_tokens_embeddings and is:
-        total_tokens_embeddings = config.vocab_size + config.n_special
-    You should use the associate indices to index the embeddings.
-
-    Params:
-        `config`: a OpenAIGPTConfig class instance with the configuration to build a new model
-        `output_attentions`: If True, also output attentions weights computed by the model at each layer. Default: False
-        `keep_multihead_output`: If True, saves output of the multi-head attention module with its gradient.
-            This can be used to compute head importance metrics. Default: False
-
-    Inputs:
-        `input_ids`: a torch.LongTensor of shape [batch_size, num_choices, sequence_length] with the BPE token
-            indices selected in the range [0, total_tokens_embeddings[
-        `mc_token_ids`: a torch.LongTensor of shape [batch_size, num_choices] with the index of the token from
-            which we should take the hidden state to feed the multiple choice classifier (usually last token of the sequence)
-        `position_ids`: an optional torch.LongTensor with the same shape as input_ids
-            with the position indices (selected in the range [0, config.n_positions - 1[.
-        `token_type_ids`: an optional torch.LongTensor with the same shape as input_ids
-            You can use it to add a third type of embedding to each input token in the sequence
-            (the previous two being the word and position embeddings).
-            The input, position and token_type embeddings are summed inside the Transformer before the first
-            self-attention block.
-        `lm_labels`: optional language modeling labels: torch.LongTensor of shape [batch_size, num_choices, sequence_length]
-            with indices selected in [-1, 0, ..., total_tokens_embeddings]. All labels set to -1 are ignored (masked), the loss
-            is only computed for the labels set in [0, ..., total_tokens_embeddings]
-        `multiple_choice_labels`: optional multiple choice labels: torch.LongTensor of shape [batch_size]
-            with indices selected in [0, ..., num_choices].
-        `head_mask`: an optional torch.Tensor of shape [num_heads] or [num_layers, num_heads] with indices between 0 and 1.
-            It's a mask to be used to nullify some heads of the transformer. 1.0 => head is fully masked, 0.0 => head is not masked.
-
-    Outputs:
-        if `lm_labels` and `multiple_choice_labels` are not `None`:
-            Outputs a tuple of losses with the language modeling loss and the multiple choice loss.
-        else: a tuple with
-            `lm_logits`: the language modeling logits as a torch.FloatTensor of size [batch_size, num_choices, sequence_length, total_tokens_embeddings]
-            `multiple_choice_logits`: the multiple choice logits as a torch.FloatTensor of size [batch_size, num_choices]
-
-    Example usage:
-    ```python
-    # Already been converted into BPE token ids
-    input_ids = torch.LongTensor([[[31, 51, 99], [15, 5, 0]]])  # (bsz, number of choice, seq length)
-    mc_token_ids = torch.LongTensor([[2], [1]]) # (bsz, number of choice)
-
-    config = modeling_openai.OpenAIGPTConfig()
-
-    model = modeling_openai.OpenAIGPTDoubleHeadsModel(config)
-    lm_logits, multiple_choice_logits = model(input_ids, mc_token_ids)
-    ```
-    """
-
-    def __init__(self, config, output_attentions=False, keep_multihead_output=False):
-        super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
-        self.transformer = OpenAIGPTModel(config, output_attentions=output_attentions,
-                                             keep_multihead_output=keep_multihead_output)
-        self.lm_head = OpenAIGPTLMHead(self.transformer.tokens_embed.weight, config)
-        self.multiple_choice_head = OpenAIGPTMultipleChoiceHead(config)
-        self.apply(self.init_weights)
-
-    def set_num_special_tokens(self, num_special_tokens, predict_special_tokens=True):
-        """ Update input and output embeddings with new embedding matrice
-            Make sure we are sharing the embeddings
-        """
-        self.config.predict_special_tokens = self.transformer.config.predict_special_tokens = predict_special_tokens
-        self.transformer.set_num_special_tokens(num_special_tokens)
-        self.lm_head.set_embeddings_weights(self.transformer.tokens_embed.weight, predict_special_tokens=predict_special_tokens)
-
-    def forward(self, input_ids, mc_token_ids, lm_labels=None, mc_labels=None, token_type_ids=None,
-                position_ids=None, head_mask=None):
-        hidden_states = self.transformer(input_ids, position_ids, token_type_ids, head_mask)
-        if self.transformer.output_attentions:
-            all_attentions, hidden_states = hidden_states
-        hidden_states = hidden_states[-1]
-
-        lm_logits = self.lm_head(hidden_states)
-        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids)
-        losses = []
-        if lm_labels is not None:
-            shift_logits = lm_logits[..., :-1, :].contiguous()
-            shift_labels = lm_labels[..., 1:].contiguous()
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            losses.append(loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)))
-        if mc_labels is not None:
-            loss_fct = CrossEntropyLoss()
-            losses.append(loss_fct(mc_logits.view(-1, mc_logits.size(-1)), mc_labels.view(-1)))
-        if losses:
-            return losses
-        if self.transformer.output_attentions:
-            return all_attentions, lm_logits, mc_logits
-        return lm_logits, mc_logits
--- a/pytorch_pretrained_bert/optimization.py
+++ b/pytorch_pretrained_bert/optimization.py
@@ -1,301 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch optimization for BERT model."""
-
-import math
-import torch
-from torch.optim import Optimizer
-from torch.optim.optimizer import required
-from torch.nn.utils import clip_grad_norm_
-import logging
-import abc
-import sys
-
-logger = logging.getLogger(__name__)
-
-
-if sys.version_info >= (3, 4):
-    ABC = abc.ABC
-else:
-    ABC = abc.ABCMeta('ABC', (), {})
-
-
-class _LRSchedule(ABC):
-    """ Parent of all LRSchedules here. """
-    warn_t_total = False        # is set to True for schedules where progressing beyond t_total steps doesn't make sense
-    def __init__(self, warmup=0.002, t_total=-1, **kw):
-        """
-        :param warmup:  what fraction of t_total steps will be used for linear warmup
-        :param t_total: how many training steps (updates) are planned
-        :param kw:
-        """
-        super(_LRSchedule, self).__init__(**kw)
-        if t_total < 0:
-            logger.warning("t_total value of {} results in schedule not being applied".format(t_total))
-        if not 0.0 <= warmup < 1.0 and not warmup == -1:
-            raise ValueError("Invalid warmup: {} - should be in [0.0, 1.0[ or -1".format(warmup))
-        warmup = max(warmup, 0.)
-        self.warmup, self.t_total = float(warmup), float(t_total)
-        self.warned_for_t_total_at_progress = -1
-
-    def get_lr(self, step, nowarn=False):
-        """
-        :param step:    which of t_total steps we're on
-        :param nowarn:  set to True to suppress warning regarding training beyond specified 't_total' steps
-        :return:        learning rate multiplier for current update
-        """
-        if self.t_total < 0:
-            return 1.
-        progress = float(step) / self.t_total
-        ret = self.get_lr_(progress)
-        # warning for exceeding t_total (only active with warmup_linear
-        if not nowarn and self.warn_t_total and progress > 1. and progress > self.warned_for_t_total_at_progress:
-            logger.warning(
-                "Training beyond specified 't_total'. Learning rate multiplier set to {}. Please set 't_total' of {} correctly."
-                    .format(ret, self.__class__.__name__))
-            self.warned_for_t_total_at_progress = progress
-        # end warning
-        return ret
-
-    @abc.abstractmethod
-    def get_lr_(self, progress):
-        """
-        :param progress:    value between 0 and 1 (unless going beyond t_total steps) specifying training progress
-        :return:            learning rate multiplier for current update
-        """
-        return 1.
-
-
-class ConstantLR(_LRSchedule):
-    def get_lr_(self, progress):
-        return 1.
-
-
-class WarmupCosineSchedule(_LRSchedule):
-    """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Decreases learning rate from 1. to 0. over remaining `1 - warmup` steps following a cosine curve.
-    If `cycles` (default=0.5) is different from default, learning rate follows cosine function after warmup.
-    """
-    warn_t_total = True
-    def __init__(self, warmup=0.002, t_total=-1, cycles=.5, **kw):
-        """
-        :param warmup:      see LRSchedule
-        :param t_total:     see LRSchedule
-        :param cycles:      number of cycles. Default: 0.5, corresponding to cosine decay from 1. at progress==warmup and 0 at progress==1.
-        :param kw:
-        """
-        super(WarmupCosineSchedule, self).__init__(warmup=warmup, t_total=t_total, **kw)
-        self.cycles = cycles
-
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        else:
-            progress = (progress - self.warmup) / (1 - self.warmup)   # progress after warmup
-            return 0.5 * (1. + math.cos(math.pi * self.cycles * 2 * progress))
-
-
-class WarmupCosineWithHardRestartsSchedule(WarmupCosineSchedule):
-    """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    If `cycles` (default=1.) is different from default, learning rate follows `cycles` times a cosine decaying
-    learning rate (with hard restarts).
-    """
-    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
-        super(WarmupCosineWithHardRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
-        assert(cycles >= 1.)
-
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        else:
-            progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
-            ret = 0.5 * (1. + math.cos(math.pi * ((self.cycles * progress) % 1)))
-            return ret
-
-
-class WarmupCosineWithWarmupRestartsSchedule(WarmupCosineWithHardRestartsSchedule):
-    """
-    All training progress is divided in `cycles` (default=1.) parts of equal length.
-    Every part follows a schedule with the first `warmup` fraction of the training steps linearly increasing from 0. to 1.,
-    followed by a learning rate decreasing from 1. to 0. following a cosine curve.
-    """
-    def __init__(self, warmup=0.002, t_total=-1, cycles=1., **kw):
-        assert(warmup * cycles < 1.)
-        warmup = warmup * cycles if warmup >= 0 else warmup
-        super(WarmupCosineWithWarmupRestartsSchedule, self).__init__(warmup=warmup, t_total=t_total, cycles=cycles, **kw)
-
-    def get_lr_(self, progress):
-        progress = progress * self.cycles % 1.
-        if progress < self.warmup:
-            return progress / self.warmup
-        else:
-            progress = (progress - self.warmup) / (1 - self.warmup)     # progress after warmup
-            ret = 0.5 * (1. + math.cos(math.pi * progress))
-            return ret
-
-
-class WarmupConstantSchedule(_LRSchedule):
-    """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Keeps learning rate equal to 1. after warmup.
-    """
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        return 1.
-
-
-class WarmupLinearSchedule(_LRSchedule):
-    """
-    Linearly increases learning rate from 0 to 1 over `warmup` fraction of training steps.
-    Linearly decreases learning rate from 1. to 0. over remaining `1 - warmup` steps.
-    """
-    warn_t_total = True
-    def get_lr_(self, progress):
-        if progress < self.warmup:
-            return progress / self.warmup
-        return max((progress - 1.) / (self.warmup - 1.), 0.)
-
-
-SCHEDULES = {
-    None:       ConstantLR,
-    "none":     ConstantLR,
-    "warmup_cosine": WarmupCosineSchedule,
-    "warmup_constant": WarmupConstantSchedule,
-    "warmup_linear": WarmupLinearSchedule
-}
-
-
-class BertAdam(Optimizer):
-    """Implements BERT version of Adam algorithm with weight decay fix.
-    Params:
-        lr: learning rate
-        warmup: portion of t_total for the warmup, -1  means no warmup. Default: -1
-        t_total: total number of training steps for the learning
-            rate schedule, -1  means constant learning rate of 1. (no warmup regardless of warmup setting). Default: -1
-        schedule: schedule to use for the warmup (see above).
-            Can be `'warmup_linear'`, `'warmup_constant'`, `'warmup_cosine'`, `'none'`, `None` or a `_LRSchedule` object (see below).
-            If `None` or `'none'`, learning rate is always kept constant.
-            Default : `'warmup_linear'`
-        betas: Adams betas. Default: (0.9, 0.999)
-        e: Adams epsilon. Default: 1e-6
-        weight_decay: Weight decay. Default: 0.01
-        max_grad_norm: Maximum norm for the gradients (-1 means no clipping). Default: 1.0
-    """
-    def __init__(self, params, lr=required, warmup=-1, t_total=-1, schedule='warmup_linear',
-                 betas=(0.9, 0.999), e=1e-6, weight_decay=0.01, max_grad_norm=1.0, **kwargs):
-        if lr is not required and lr < 0.0:
-            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
-        if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
-            raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter at index 0: {} - should be in [0.0, 1.0[".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter at index 1: {} - should be in [0.0, 1.0[".format(betas[1]))
-        if not e >= 0.0:
-            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
-        # initialize schedule object
-        if not isinstance(schedule, _LRSchedule):
-            schedule_type = SCHEDULES[schedule]
-            schedule = schedule_type(warmup=warmup, t_total=t_total)
-        else:
-            if warmup != -1 or t_total != -1:
-                logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
-                               "Please specify custom warmup and t_total in _LRSchedule object.")
-        defaults = dict(lr=lr, schedule=schedule,
-                        betas=betas, e=e, weight_decay=weight_decay,
-                        max_grad_norm=max_grad_norm)
-        super(BertAdam, self).__init__(params, defaults)
-
-    def get_lr(self):
-        lr = []
-        for group in self.param_groups:
-            for p in group['params']:
-                state = self.state[p]
-                if len(state) == 0:
-                    return [0]
-                lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'].get_lr(state['step'])
-                lr.append(lr_scheduled)
-        return lr
-
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                grad = p.grad.data
-                if grad.is_sparse:
-                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state['step'] = 0
-                    # Exponential moving average of gradient values
-                    state['next_m'] = torch.zeros_like(p.data)
-                    # Exponential moving average of squared gradient values
-                    state['next_v'] = torch.zeros_like(p.data)
-
-                next_m, next_v = state['next_m'], state['next_v']
-                beta1, beta2 = group['betas']
-
-                # Add grad clipping
-                if group['max_grad_norm'] > 0:
-                    clip_grad_norm_(p, group['max_grad_norm'])
-
-                # Decay the first and second moment running average coefficient
-                # In-place operations to update the averages at the same time
-                next_m.mul_(beta1).add_(1 - beta1, grad)
-                next_v.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                update = next_m / (next_v.sqrt() + group['e'])
-
-                # Just adding the square of the weights to the loss function is *not*
-                # the correct way of using L2 regularization/weight decay with Adam,
-                # since that will interact with the m and v parameters in strange ways.
-                #
-                # Instead we want to decay the weights in a manner that doesn't interact
-                # with the m/v parameters. This is equivalent to adding the square
-                # of the weights to the loss with plain (non-momentum) SGD.
-                if group['weight_decay'] > 0.0:
-                    update += group['weight_decay'] * p.data
-
-                lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'].get_lr(state['step'])
-
-                update_with_lr = lr_scheduled * update
-                p.data.add_(-update_with_lr)
-
-                state['step'] += 1
-
-                # step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
-                # No bias correction
-                # bias_correction1 = 1 - beta1 ** state['step']
-                # bias_correction2 = 1 - beta2 ** state['step']
-
-        return loss
--- a/pytorch_pretrained_bert/optimization_openai.py
+++ b/pytorch_pretrained_bert/optimization_openai.py
@@ -1,127 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""PyTorch optimization for OpenAI GPT model."""
-
-import math
-import torch
-from torch.optim import Optimizer
-from torch.optim.optimizer import required
-from torch.nn.utils import clip_grad_norm_
-import logging
-from .optimization import SCHEDULES, _LRSchedule, WarmupCosineWithWarmupRestartsSchedule, \
-    WarmupCosineWithHardRestartsSchedule, WarmupCosineSchedule, WarmupLinearSchedule, WarmupConstantSchedule
-
-logger = logging.getLogger(__name__)
-
-
-class OpenAIAdam(Optimizer):
-    """Implements Open AI version of Adam algorithm with weight decay fix.
-    """
-    def __init__(self, params, lr=required, schedule='warmup_linear', warmup=-1, t_total=-1,
-                 betas=(0.9, 0.999), e=1e-8, weight_decay=0,
-                 vector_l2=False, max_grad_norm=-1, **kwargs):
-        if lr is not required and lr < 0.0:
-            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
-        if not isinstance(schedule, _LRSchedule) and schedule not in SCHEDULES:
-            raise ValueError("Invalid schedule parameter: {}".format(schedule))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError("Invalid beta parameter at index 0: {} - should be in [0.0, 1.0[".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError("Invalid beta parameter at index 1: {} - should be in [0.0, 1.0[".format(betas[1]))
-        if not e >= 0.0:
-            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(e))
-        # initialize schedule object
-        if not isinstance(schedule, _LRSchedule):
-            schedule_type = SCHEDULES[schedule]
-            schedule = schedule_type(warmup=warmup, t_total=t_total)
-        else:
-            if warmup != -1 or t_total != -1:
-                logger.warning("warmup and t_total on the optimizer are ineffective when _LRSchedule object is provided as schedule. "
-                               "Please specify custom warmup and t_total in _LRSchedule object.")
-        defaults = dict(lr=lr, schedule=schedule,
-                        betas=betas, e=e, weight_decay=weight_decay, vector_l2=vector_l2,
-                        max_grad_norm=max_grad_norm)
-        super(OpenAIAdam, self).__init__(params, defaults)
-
-    def get_lr(self):
-        lr = []
-        for group in self.param_groups:
-            for p in group['params']:
-                state = self.state[p]
-                if len(state) == 0:
-                    return [0]
-                lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'].get_lr(state['step'])
-                lr.append(lr_scheduled)
-        return lr
-
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                grad = p.grad.data
-                if grad.is_sparse:
-                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state['step'] = 0
-                    # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
-                    # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
-
-                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
-                beta1, beta2 = group['betas']
-
-                state['step'] += 1
-
-                # Add grad clipping
-                if group['max_grad_norm'] > 0:
-                    clip_grad_norm_(p, group['max_grad_norm'])
-
-                # Decay the first and second moment running average coefficient
-                exp_avg.mul_(beta1).add_(1 - beta1, grad)
-                exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
-                denom = exp_avg_sq.sqrt().add_(group['e'])
-
-                bias_correction1 = 1 - beta1 ** state['step']
-                bias_correction2 = 1 - beta2 ** state['step']
-
-                lr_scheduled = group['lr']
-                lr_scheduled *= group['schedule'].get_lr(state['step'])
-
-                step_size = lr_scheduled * math.sqrt(bias_correction2) / bias_correction1
-
-                p.data.addcdiv_(-step_size, exp_avg, denom)
-
-                # Add weight decay at the end (fixed version)
-                if (len(p.size()) > 1 or group['vector_l2']) and group['weight_decay'] > 0:
-                    p.data.add_(-lr_scheduled * group['weight_decay'], p.data)
-
-        return loss
--- a/pytorch_pretrained_bert/tokenization_gpt2.py
+++ b/pytorch_pretrained_bert/tokenization_gpt2.py
@@ -1,316 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
-
-import sys
-import json
-import logging
-import os
-import regex as re
-from io import open
-
-try:
-    from functools import lru_cache
-except ImportError:
-    # Just a dummy decorator to get the checks to run on python2
-    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
-    def lru_cache():
-        return lambda func: func
-
-from .file_utils import cached_path
-
-logger = logging.getLogger(__name__)
-
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
-    'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
-}
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
-    'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
-}
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
-    'gpt2': 1024,
-}
-VOCAB_NAME = 'vocab.json'
-MERGES_NAME = 'merges.txt'
-SPECIAL_TOKENS_NAME = 'special_tokens.txt'
-
-@lru_cache()
-def bytes_to_unicode():
-    """
-    Returns list of utf-8 byte and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a signficant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    And avoids mapping to whitespace/control characters the bpe code barfs on.
-    """
-    _chr = unichr if sys.version_info[0] == 2 else chr
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [_chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-def get_pairs(word):
-    """Return set of symbol pairs in a word.
-
-    Word is represented as tuple of symbols (symbols being variable-length strings).
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-class GPT2Tokenizer(object):
-    """
-    GPT-2 BPE tokenizer. Peculiarities:
-        - Byte-level BPE
-    """
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a GPT2Tokenizer from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
-            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info("loading special tokens file {}".format(special_tokens_file))
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {} and {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path,
-                        vocab_file, merges_file))
-            return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-            logger.info("loading merges file {} from cache at {}".format(
-                merges_file, resolved_merges_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
-        # Instantiate tokenizer.
-        if special_tokens_file and 'special_tokens' not in kwargs:
-            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-        else:
-            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
-        return tokenizer
-
-    def __init__(self, vocab_file, merges_file, errors='replace', special_tokens=None, max_len=None):
-        self.max_len = max_len if max_len is not None else int(1e12)
-        self.encoder = json.load(open(vocab_file))
-        self.decoder = {v:k for k,v in self.encoder.items()}
-        self.errors = errors # how to handle errors in decoding
-        self.byte_encoder = bytes_to_unicode()
-        self.byte_decoder = {v:k for k, v in self.byte_encoder.items()}
-        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
-        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
-        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
-        self.cache = {}
-
-        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
-        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
-
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
-        logger.info("Special tokens {}".format(self.special_tokens))
-
-    def bpe(self, token):
-        if token in self.cache:
-            return self.cache[token]
-        word = tuple(token)
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token
-
-        while True:
-            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                    new_word.extend(word[i:j])
-                    i = j
-                except:
-                    new_word.extend(word[i:])
-                    break
-
-                if word[i] == first and i < len(word)-1 and word[i+1] == second:
-                    new_word.append(first+second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = ' '.join(word)
-        self.cache[token] = word
-        return word
-
-    def tokenize(self, text):
-        """ Tokenize a string. """
-        bpe_tokens = []
-        for token in re.findall(self.pat, text):
-            if sys.version_info[0] == 2:
-                token = ''.join(self.byte_encoder[ord(b)] for b in token)
-            else:
-                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
-            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
-        return bpe_tokens
-
-    def convert_tokens_to_ids(self, tokens):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
-            )
-        return ids
-
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
-
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, tokens, skip_special_tokens=False, clean_up_tokenization_spaces=True):
-        text = ''.join(self.convert_ids_to_tokens(tokens, skip_special_tokens=skip_special_tokens))
-        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
-        if clean_up_tokenization_spaces:
-            text = text.replace('<unk>', '')
-            text = text.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
-                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
-        return text
-
-    def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
-            return
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        merge_file = os.path.join(vocab_path, MERGES_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
-
-        with open(vocab_file, 'w', encoding='utf-8') as f:
-            f.write(json.dumps(self.encoder, ensure_ascii=False))
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write(u'#version: 0.2\n')
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
-                    index = token_index
-                writer.write(' '.join(bpe_tokens) + u'\n')
-                index += 1
-
-        index = len(self.encoder)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return vocab_file, merge_file, special_tokens_file
--- a/pytorch_pretrained_bert/tokenization_openai.py
+++ b/pytorch_pretrained_bert/tokenization_openai.py
@@ -1,318 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
-
-import json
-import logging
-import os
-import re
-import sys
-from io import open
-
-from tqdm import tqdm
-
-from .file_utils import cached_path
-from .tokenization import BasicTokenizer
-
-logger = logging.getLogger(__name__)
-
-PRETRAINED_VOCAB_ARCHIVE_MAP = {
-    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-vocab.json",
-}
-PRETRAINED_MERGES_ARCHIVE_MAP = {
-    'openai-gpt': "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-merges.txt",
-}
-PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP = {
-    'openai-gpt': 512,
-}
-VOCAB_NAME = 'vocab.json'
-MERGES_NAME = 'merges.txt'
-SPECIAL_TOKENS_NAME = 'special_tokens.txt'
-
-def get_pairs(word):
-    """
-    Return set of symbol pairs in a word.
-    word is represented as tuple of symbols (symbols being variable-length strings)
-    """
-    pairs = set()
-    prev_char = word[0]
-    for char in word[1:]:
-        pairs.add((prev_char, char))
-        prev_char = char
-    return pairs
-
-def text_standardize(text):
-    """
-    fixes some issues the spacy tokenizer had on books corpus
-    also does some whitespace standardization
-    """
-    text = text.replace('—', '-')
-    text = text.replace('–', '-')
-    text = text.replace('―', '-')
-    text = text.replace('…', '...')
-    text = text.replace('´', "'")
-    text = re.sub(r'''(-+|~+|!+|"+|;+|\?+|\++|,+|\)+|\(+|\\+|\/+|\*+|\[+|\]+|}+|{+|\|+|_+)''', r' \1 ', text)
-    text = re.sub(r'\s*\n\s*', ' \n ', text)
-    text = re.sub(r'[^\S\n]+', ' ', text)
-    return text.strip()
-
-class OpenAIGPTTokenizer(object):
-    """
-    BPE tokenizer. Peculiarities:
-        - lower case all inputs
-        - uses SpaCy tokenizer and ftfy for pre-BPE tokenization if they are installed, fallback to BERT's BasicTokenizer if not.
-        - argument special_tokens and function set_special_tokens:
-            can be used to add additional symbols (ex: "__classify__") to a vocabulary.
-    """
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
-        """
-        Instantiate a PreTrainedBertModel from a pre-trained model file.
-        Download and cache the pre-trained model file if needed.
-        """
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-            vocab_file = PRETRAINED_VOCAB_ARCHIVE_MAP[pretrained_model_name_or_path]
-            merges_file = PRETRAINED_MERGES_ARCHIVE_MAP[pretrained_model_name_or_path]
-            special_tokens_file = None
-        else:
-            vocab_file = os.path.join(pretrained_model_name_or_path, VOCAB_NAME)
-            merges_file = os.path.join(pretrained_model_name_or_path, MERGES_NAME)
-            special_tokens_file = os.path.join(pretrained_model_name_or_path, SPECIAL_TOKENS_NAME)
-            if not os.path.exists(special_tokens_file):
-                special_tokens_file = None
-            else:
-                logger.info("loading special tokens file {}".format(special_tokens_file))
-        # redirect to the cache, if necessary
-        try:
-            resolved_vocab_file = cached_path(vocab_file, cache_dir=cache_dir)
-            resolved_merges_file = cached_path(merges_file, cache_dir=cache_dir)
-        except EnvironmentError:
-            if pretrained_model_name_or_path in PRETRAINED_VOCAB_ARCHIVE_MAP:
-                logger.error(
-                    "Couldn't reach server at '{}' to download vocabulary.".format(
-                        vocab_file))
-            else:
-                logger.error(
-                    "Model name '{}' was not found in model name list ({}). "
-                    "We assumed '{}' was a path or url but couldn't find files {} and {} "
-                    "at this path or url.".format(
-                        pretrained_model_name_or_path,
-                        ', '.join(PRETRAINED_VOCAB_ARCHIVE_MAP.keys()),
-                        pretrained_model_name_or_path,
-                        vocab_file, merges_file))
-            return None
-        if resolved_vocab_file == vocab_file and resolved_merges_file == merges_file:
-            logger.info("loading vocabulary file {}".format(vocab_file))
-            logger.info("loading merges file {}".format(merges_file))
-        else:
-            logger.info("loading vocabulary file {} from cache at {}".format(
-                vocab_file, resolved_vocab_file))
-            logger.info("loading merges file {} from cache at {}".format(
-                merges_file, resolved_merges_file))
-        if pretrained_model_name_or_path in PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP:
-            # if we're using a pretrained model, ensure the tokenizer wont index sequences longer
-            # than the number of positional embeddings
-            max_len = PRETRAINED_VOCAB_POSITIONAL_EMBEDDINGS_SIZE_MAP[pretrained_model_name_or_path]
-            kwargs['max_len'] = min(kwargs.get('max_len', int(1e12)), max_len)
-        # Instantiate tokenizer.
-        if special_tokens_file and 'special_tokens' not in kwargs:
-            special_tokens = open(special_tokens_file, encoding='utf-8').read().split('\n')[:-1]
-        else:
-            special_tokens = kwargs.pop('special_tokens', [])
-        tokenizer = cls(resolved_vocab_file, resolved_merges_file, special_tokens=special_tokens, *inputs, **kwargs)
-        return tokenizer
-
-    def __init__(self, vocab_file, merges_file, special_tokens=None, max_len=None):
-        try:
-            import ftfy
-            import spacy
-            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
-            self.fix_text = ftfy.fix_text
-        except ImportError:
-            logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
-            self.nlp = BasicTokenizer(do_lower_case=True,
-                                      never_split=special_tokens if special_tokens is not None else [])
-            self.fix_text = None
-
-        self.max_len = max_len if max_len is not None else int(1e12)
-        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
-        self.decoder = {v:k for k,v in self.encoder.items()}
-        merges = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
-        merges = [tuple(merge.split()) for merge in merges]
-        self.bpe_ranks = dict(zip(merges, range(len(merges))))
-        self.cache = {}
-        self.special_tokens = {}
-        self.special_tokens_decoder = {}
-        self.set_special_tokens(special_tokens)
-
-    def __len__(self):
-        return len(self.encoder) + len(self.special_tokens)
-
-    def set_special_tokens(self, special_tokens):
-        """ Add a list of additional tokens to the encoder.
-            The additional tokens are indexed starting from the last index of the
-            current vocabulary in the order of the `special_tokens` list.
-        """
-        if not special_tokens:
-            self.special_tokens = {}
-            self.special_tokens_decoder = {}
-            return
-        self.special_tokens = dict((tok, len(self.encoder) + i) for i, tok in enumerate(special_tokens))
-        self.special_tokens_decoder = {v:k for k, v in self.special_tokens.items()}
-        if self.fix_text is None:
-            # Using BERT's BasicTokenizer: we can update the tokenizer
-            self.nlp.never_split = special_tokens
-        logger.info("Special tokens {}".format(self.special_tokens))
-
-    def bpe(self, token):
-        word = tuple(token[:-1]) + (token[-1] + '</w>',)
-        if token in self.cache:
-            return self.cache[token]
-        pairs = get_pairs(word)
-
-        if not pairs:
-            return token+'</w>'
-
-        while True:
-            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
-            if bigram not in self.bpe_ranks:
-                break
-            first, second = bigram
-            new_word = []
-            i = 0
-            while i < len(word):
-                try:
-                    j = word.index(first, i)
-                    new_word.extend(word[i:j])
-                    i = j
-                except:
-                    new_word.extend(word[i:])
-                    break
-
-                if word[i] == first and i < len(word)-1 and word[i+1] == second:
-                    new_word.append(first+second)
-                    i += 2
-                else:
-                    new_word.append(word[i])
-                    i += 1
-            new_word = tuple(new_word)
-            word = new_word
-            if len(word) == 1:
-                break
-            else:
-                pairs = get_pairs(word)
-        word = ' '.join(word)
-        if word == '\n  </w>':
-            word = '\n</w>'
-        self.cache[token] = word
-        return word
-
-    def tokenize(self, text):
-        """ Tokenize a string. """
-        split_tokens = []
-        if self.fix_text is None:
-            # Using BERT's BasicTokenizer
-            text = self.nlp.tokenize(text)
-            for token in text:
-                split_tokens.extend([t for t in self.bpe(token).split(' ')])
-        else:
-            # Using SpaCy & ftfy (original tokenization process of OpenAI GPT)
-            text = self.nlp(text_standardize(self.fix_text(text)))
-            for token in text:
-                split_tokens.extend([t for t in self.bpe(token.text.lower()).split(' ')])
-        return split_tokens
-
-    def convert_tokens_to_ids(self, tokens):
-        """ Converts a sequence of tokens into ids using the vocab. """
-        ids = []
-        if isinstance(tokens, str) or (sys.version_info[0] == 2 and isinstance(tokens, unicode)):
-            if tokens in self.special_tokens:
-                return self.special_tokens[tokens]
-            else:
-                return self.encoder.get(tokens, 0)
-        for token in tokens:
-            if token in self.special_tokens:
-                ids.append(self.special_tokens[token])
-            else:
-                ids.append(self.encoder.get(token, 0))
-        if len(ids) > self.max_len:
-            logger.warning(
-                "Token indices sequence length is longer than the specified maximum "
-                " sequence length for this OpenAI GPT model ({} > {}). Running this"
-                " sequence through the model will result in indexing errors".format(len(ids), self.max_len)
-            )
-        return ids
-
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        """Converts a sequence of ids in BPE tokens using the vocab."""
-        tokens = []
-        for i in ids:
-            if i in self.special_tokens_decoder:
-                if not skip_special_tokens:
-                    tokens.append(self.special_tokens_decoder[i])
-            else:
-                tokens.append(self.decoder[i])
-        return tokens
-
-    def encode(self, text):
-        return self.convert_tokens_to_ids(self.tokenize(text))
-
-    def decode(self, ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
-        """Converts a sequence of ids in a string."""
-        tokens = self.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)
-        out_string = ''.join(tokens).replace('</w>', ' ').strip()
-        if clean_up_tokenization_spaces:
-            out_string = out_string.replace('<unk>', '')
-            out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
-                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
-        return out_string
-
-    def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary and merge files to a directory."""
-        if not os.path.isdir(vocab_path):
-            logger.error("Vocabulary path ({}) should be a directory".format(vocab_path))
-            return
-        vocab_file = os.path.join(vocab_path, VOCAB_NAME)
-        merge_file = os.path.join(vocab_path, MERGES_NAME)
-        special_tokens_file = os.path.join(vocab_path, SPECIAL_TOKENS_NAME)
-
-        with open(vocab_file, 'w', encoding='utf-8') as f:
-            f.write(json.dumps(self.encoder, ensure_ascii=False))
-
-        index = 0
-        with open(merge_file, "w", encoding="utf-8") as writer:
-            writer.write(u'#version: 0.2\n')
-            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
-                    index = token_index
-                writer.write(' '.join(bpe_tokens) + u'\n')
-                index += 1
-
-        index = len(self.encoder)
-        with open(special_tokens_file, 'w', encoding='utf-8') as writer:
-            for token, token_index in sorted(self.special_tokens.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving special tokens vocabulary to {}: BPE indices are not consecutive."
-                                   " Please check that the tokenizer is not corrupted!".format(special_tokens_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-
-        return vocab_file, merge_file, special_tokens_file
--- a/pytorch_transformers/init.py
+++ b/pytorch_transformers/init.py
@@ -0,0 +1,42 @@
+__version__ = "0.7.0"
+from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .tokenization_openai import OpenAIGPTTokenizer
+from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
+from .tokenization_gpt2 import GPT2Tokenizer
+from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
+from .tokenization_xlm import XLMTokenizer
+from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)
+
+from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
+                       BertForMaskedLM, BertForNextSentencePrediction,
+                       BertForSequenceClassification, BertForMultipleChoice,
+                       BertForTokenClassification, BertForQuestionAnswering,
+                       load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                       BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
+from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
+                              OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
+                              load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                              OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
+from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
+                                  load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
+from .modeling_gpt2 import (GPT2Config, GPT2Model,
+                            GPT2LMHeadModel, GPT2DoubleHeadsModel,
+                            load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                            GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
+from .modeling_xlnet import (XLNetConfig,
+                             XLNetPreTrainedModel, XLNetModel, XLNetLMHeadModel,
+                             XLNetForSequenceClassification, XLNetForQuestionAnswering,
+                             load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                             XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
+from .modeling_xlm import (XLMConfig, XLMModel,
+                           XLMWithLMHeadModel, XLMForSequenceClassification,
+                           XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                           XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
+                          PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
+
+from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
+                           WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
+
+from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
--- a/pytorch_transformers/main.py
+++ b/pytorch_transformers/main.py
@@ -0,0 +1,128 @@
+# coding: utf8
+def main():  # Command-line entry point: runs the checkpoint converter selected by sys.argv[1].
+    import sys
+    if (len(sys.argv) < 4 or len(sys.argv) > 6) or sys.argv[1] not in ["bert", "gpt", "transfo_xl", "gpt2", "xlnet", "xlm"]:
+        print(
+        "Should be used as one of: \n"
+        ">> pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
+        ">> pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
+        ">> pytorch_transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
+        ">> pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
+        ">> pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME] or \n"
+        ">> pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT")
+    else:
+        if sys.argv[1] == "bert":
+            try:
+                from .convert_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+            except ImportError:
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
+                    "https://www.tensorflow.org/install/ for installation instructions.")
+                raise  # re-raise after printing the installation hint
+
+            if len(sys.argv) != 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+            else:
+                PYTORCH_DUMP_OUTPUT = sys.argv.pop()  # the three paths are popped from the end of argv, last one first
+                TF_CONFIG = sys.argv.pop()
+                TF_CHECKPOINT = sys.argv.pop()
+                convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
+        elif sys.argv[1] == "gpt":
+            from .convert_openai_checkpoint_to_pytorch import convert_openai_checkpoint_to_pytorch
+            if len(sys.argv) < 4 or len(sys.argv) > 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG]`")
+            else:
+                OPENAI_GPT_CHECKPOINT_FOLDER_PATH = sys.argv[2]
+                PYTORCH_DUMP_OUTPUT = sys.argv[3]
+                if len(sys.argv) == 5:
+                    OPENAI_GPT_CONFIG = sys.argv[4]
+                else:
+                    OPENAI_GPT_CONFIG = ""  # optional config argument not given
+                convert_openai_checkpoint_to_pytorch(OPENAI_GPT_CHECKPOINT_FOLDER_PATH,
+                                                    OPENAI_GPT_CONFIG,
+                                                    PYTORCH_DUMP_OUTPUT)
+        elif sys.argv[1] == "transfo_xl":
+            try:
+                from .convert_transfo_xl_checkpoint_to_pytorch import convert_transfo_xl_checkpoint_to_pytorch
+            except ImportError:
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
+                    "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+            if len(sys.argv) < 4 or len(sys.argv) > 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_transformers transfo_xl TF_CHECKPOINT/TF_DATASET_FILE PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+            else:
+                if 'ckpt' in sys.argv[2].lower():  # a path containing "ckpt" is passed as the checkpoint, anything else as the dataset file
+                    TF_CHECKPOINT = sys.argv[2]
+                    TF_DATASET_FILE = ""
+                else:
+                    TF_DATASET_FILE = sys.argv[2]
+                    TF_CHECKPOINT = ""
+                PYTORCH_DUMP_OUTPUT = sys.argv[3]
+                if len(sys.argv) == 5:
+                    TF_CONFIG = sys.argv[4]
+                else:
+                    TF_CONFIG = ""
+                convert_transfo_xl_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT, TF_DATASET_FILE)
+        elif sys.argv[1] == "gpt2":
+            try:
+                from .convert_gpt2_checkpoint_to_pytorch import convert_gpt2_checkpoint_to_pytorch
+            except ImportError:
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
+                    "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+
+            if len(sys.argv) < 4 or len(sys.argv) > 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [TF_CONFIG]`")
+            else:
+                TF_CHECKPOINT = sys.argv[2]
+                PYTORCH_DUMP_OUTPUT = sys.argv[3]
+                if len(sys.argv) == 5:
+                    TF_CONFIG = sys.argv[4]
+                else:
+                    TF_CONFIG = ""
+                convert_gpt2_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
+        elif sys.argv[1] == "xlnet":
+            try:
+                from .convert_xlnet_checkpoint_to_pytorch import convert_xlnet_checkpoint_to_pytorch
+            except ImportError:
+                print("pytorch_transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
+                    "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+
+            if len(sys.argv) < 5 or len(sys.argv) > 6:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_transformers xlnet TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT [FINETUNING_TASK_NAME]`")
+            else:
+                TF_CHECKPOINT = sys.argv[2]
+                TF_CONFIG = sys.argv[3]
+                PYTORCH_DUMP_OUTPUT = sys.argv[4]
+                if len(sys.argv) == 6:
+                    FINETUNING_TASK = sys.argv[5]
+                else:
+                    FINETUNING_TASK = None  # optional task name not given
+
+                convert_xlnet_checkpoint_to_pytorch(TF_CHECKPOINT,
+                                                    TF_CONFIG,
+                                                    PYTORCH_DUMP_OUTPUT,
+                                                    FINETUNING_TASK)
+        elif sys.argv[1] == "xlm":
+            from .convert_xlm_checkpoint_to_pytorch import convert_xlm_checkpoint_to_pytorch
+
+            if len(sys.argv) != 4:
+                # pylint: disable=line-too-long
+                print("Should be used as `pytorch_transformers xlm XLM_CHECKPOINT_PATH PYTORCH_DUMP_OUTPUT`")
+            else:
+                XLM_CHECKPOINT_PATH = sys.argv[2]
+                PYTORCH_DUMP_OUTPUT = sys.argv[3]
+
+                convert_xlm_checkpoint_to_pytorch(XLM_CHECKPOINT_PATH, PYTORCH_DUMP_OUTPUT)
+
+if __name__ == '__main__':
+    main()
--- a/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_gpt2_checkpoint_to_pytorch.py
@@ -21,11 +21,14 @@ from io import open

 import torch

-from pytorch_pretrained_bert.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers.modeling_gpt2 import (CONFIG_NAME, WEIGHTS_NAME,
                                                     GPT2Config,
                                                     GPT2Model,
                                                     load_tf_weights_in_gpt2)

+import logging
+logging.basicConfig(level=logging.INFO)
+

 def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, pytorch_dump_folder_path):
    # Construct model
@@ -36,7 +39,7 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p
    model = GPT2Model(config)

    # Load weights from numpy
-    load_tf_weights_in_gpt2(model, gpt2_checkpoint_path)
+    load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path)

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
--- a/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_openai_checkpoint_to_pytorch.py
@@ -21,11 +21,14 @@ from io import open

 import torch

-from pytorch_pretrained_bert.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
+from pytorch_transformers.modeling_openai import (CONFIG_NAME, WEIGHTS_NAME,
                                                     OpenAIGPTConfig,
                                                     OpenAIGPTModel,
                                                     load_tf_weights_in_openai_gpt)

+import logging
+logging.basicConfig(level=logging.INFO)
+

 def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_config_file, pytorch_dump_folder_path):
    # Construct model
@@ -36,7 +39,7 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c
    model = OpenAIGPTModel(config)

    # Load weights from numpy
-    load_tf_weights_in_openai_gpt(model, openai_checkpoint_folder_path)
+    load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path)

    # Save pytorch-model
    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
--- a/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
+++ b/pytorch_pretrained_bert/convert_pytorch_checkpoint_to_tf.py
--- a/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_tf_checkpoint_to_pytorch.py
@@ -18,14 +18,13 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function

-import os
-import re
 import argparse
-import tensorflow as tf
 import torch
-import numpy as np

-from pytorch_pretrained_bert.modeling import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+from pytorch_transformers.modeling_bert import BertConfig, BertForPreTraining, load_tf_weights_in_bert
+
+import logging
+logging.basicConfig(level=logging.INFO)

 def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_path):
    # Initialise PyTorch model
@@ -34,7 +33,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytor
    model = BertForPreTraining(config)

    # Load weights from tf checkpoint
-    load_tf_weights_in_bert(model, tf_checkpoint_path)
+    load_tf_weights_in_bert(model, config, tf_checkpoint_path)

    # Save pytorch-model
    print("Save PyTorch model to {}".format(pytorch_dump_path))
--- a/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_pretrained_bert/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -23,20 +23,22 @@ from io import open

 import torch

-import pytorch_pretrained_bert.tokenization_transfo_xl as data_utils
-from pytorch_pretrained_bert.modeling_transfo_xl import (CONFIG_NAME,
+import pytorch_transformers.tokenization_transfo_xl as data_utils
+from pytorch_transformers.modeling_transfo_xl import (CONFIG_NAME,
                                                         WEIGHTS_NAME,
                                                         TransfoXLConfig,
                                                         TransfoXLLMHeadModel,
                                                         load_tf_weights_in_transfo_xl)
-from pytorch_pretrained_bert.tokenization_transfo_xl import (CORPUS_NAME,
-                                                             VOCAB_NAME)
+from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)

 if sys.version_info[0] == 2:
    import cPickle as pickle
 else:
    import pickle

+import logging
+logging.basicConfig(level=logging.INFO)
+
 # We do this to be able to load python 2 datasets pickles
 # See e.g. https://stackoverflow.com/questions/2121874/python-pickling-after-changing-a-modules-directory/2121918#2121918
 data_utils.Vocab = data_utils.TransfoXLTokenizer
@@ -53,7 +55,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
        with open(transfo_xl_dataset_file, "rb") as fp:
            corpus = pickle.load(fp, encoding="latin1")
        # Save vocabulary and dataset cache as Dictionaries (should be better than pickles for the long-term)
-        pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_NAME
+        pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' + VOCAB_FILES_NAMES['pretrained_vocab_file']
        print("Save vocabulary to {}".format(pytorch_vocab_dump_path))
        corpus_vocab_dict = corpus.vocab.__dict__
        torch.save(corpus_vocab_dict, pytorch_vocab_dump_path)
--- a/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlm_checkpoint_to_pytorch.py
@@ -0,0 +1,75 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert XLM checkpoint."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import json
+from io import open
+
+import torch
+import numpy
+
+from pytorch_transformers.modeling_utils import CONFIG_NAME, WEIGHTS_NAME
+from pytorch_transformers.tokenization_xlm import VOCAB_FILES_NAMES
+
+import logging
+logging.basicConfig(level=logging.INFO)
+
+def convert_xlm_checkpoint_to_pytorch(xlm_checkpoint_path, pytorch_dump_folder_path):
+    """Split the XLM checkpoint at `xlm_checkpoint_path` into weights, config and vocab files in `pytorch_dump_folder_path`."""
+    chkpt = torch.load(xlm_checkpoint_path, map_location='cpu')
+
+    model = chkpt['model']
+
+    # Tensor / ndarray entries are dropped from the params: the config is dumped with json below.
+    config = dict((n, v) for n, v in chkpt['params'].items() if not isinstance(v, (torch.Tensor, numpy.ndarray)))
+
+    # Entries without an '@@' marker and with index > 13 get a '</w>' suffix; all others just have '@@' removed.
+    vocab = dict((s + '</w>' if s.find('@@') == -1 and i > 13 else s.replace('@@', ''), i) for s, i in chkpt['dico_word2id'].items())
+
+    # Output paths inside the dump folder
+    pytorch_weights_dump_path = pytorch_dump_folder_path + '/' + WEIGHTS_NAME
+    pytorch_config_dump_path = pytorch_dump_folder_path + '/' + CONFIG_NAME
+    pytorch_vocab_dump_path = pytorch_dump_folder_path + '/' +  VOCAB_FILES_NAMES['vocab_file']
+
+    print("Save PyTorch model to {}".format(pytorch_weights_dump_path))
+    torch.save(model, pytorch_weights_dump_path)
+
+    print("Save configuration file to {}".format(pytorch_config_dump_path))
+    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
+        f.write(json.dumps(config, indent=2) + "\n")
+
+    print("Save vocab file to {}".format(pytorch_vocab_dump_path))  # was reporting the config path
+    with open(pytorch_vocab_dump_path, "w", encoding="utf-8") as f:
+        f.write(json.dumps(vocab, indent=2) + "\n")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters: both paths must be given on the command line
+    parser.add_argument("--xlm_checkpoint_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the official PyTorch dump.")
+    parser.add_argument("--pytorch_dump_folder_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the output PyTorch model.")
+    args = parser.parse_args()
+    convert_xlm_checkpoint_to_pytorch(args.xlm_checkpoint_path, args.pytorch_dump_folder_path)
--- a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
@@ -0,0 +1,104 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert XLNet checkpoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import argparse
+import torch
+
+from pytorch_transformers.modeling_xlnet import (CONFIG_NAME, WEIGHTS_NAME,
+                                                    XLNetConfig,
+                                                    XLNetLMHeadModel, XLNetForQuestionAnswering,
+                                                    XLNetForSequenceClassification,
+                                                    load_tf_weights_in_xlnet)
+
+GLUE_TASKS_NUM_LABELS = {
+    "cola": 2,
+    "mnli": 3,
+    "mrpc": 2,
+    "sst-2": 2,
+    "sts-b": 1,
+    "qqp": 2,
+    "qnli": 2,
+    "rte": 2,
+    "wnli": 2,
+}
+
+import logging
+logging.basicConfig(level=logging.INFO)
+
+def convert_xlnet_checkpoint_to_pytorch(tf_checkpoint_path, bert_config_file, pytorch_dump_folder_path, finetuning_task=None):
+    """Load TF XLNet weights into the PyTorch model chosen by `finetuning_task`; save weights and config in the dump folder."""
+    from io import open  # the builtin open() of Python 2 does not accept `encoding`; the sibling converters import it too
+    config = XLNetConfig.from_json_file(bert_config_file)  # despite its name, `bert_config_file` is the XLNet config json
+    finetuning_task = finetuning_task.lower() if finetuning_task is not None else ""
+    if finetuning_task in GLUE_TASKS_NUM_LABELS:
+        print("Building PyTorch XLNetForSequenceClassification model from configuration: {}".format(str(config)))
+        config.finetuning_task = finetuning_task
+        config.num_labels = GLUE_TASKS_NUM_LABELS[finetuning_task]
+        model = XLNetForSequenceClassification(config)
+    elif 'squad' in finetuning_task:  # any task name containing "squad" gets the question-answering head
+        config.finetuning_task = finetuning_task
+        model = XLNetForQuestionAnswering(config)
+    else:  # no task, or a task name that is neither a GLUE key nor contains "squad": LM-head model
+        model = XLNetLMHeadModel(config)
+
+    # Load weights from tf checkpoint
+    load_tf_weights_in_xlnet(model, config, tf_checkpoint_path)
+
+    # Save the state dict and the (possibly task-updated) config side by side in the dump folder
+    pytorch_weights_dump_path = os.path.join(pytorch_dump_folder_path, WEIGHTS_NAME)
+    pytorch_config_dump_path = os.path.join(pytorch_dump_folder_path, CONFIG_NAME)
+    print("Save PyTorch model to {}".format(os.path.abspath(pytorch_weights_dump_path)))
+    torch.save(model.state_dict(), pytorch_weights_dump_path)
+    print("Save configuration file to {}".format(os.path.abspath(pytorch_config_dump_path)))
+    with open(pytorch_config_dump_path, "w", encoding="utf-8") as f:
+        f.write(config.to_json_string())
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters (only --finetuning_task below is optional)
+    parser.add_argument("--tf_checkpoint_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the TensorFlow checkpoint.")
+    parser.add_argument("--xlnet_config_file",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "The config json file corresponding to the pre-trained XLNet model. \n"
+                               "This specifies the model architecture.")
+    parser.add_argument("--pytorch_dump_folder_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the folder to store the PyTorch model or dataset/vocab.")
+    parser.add_argument("--finetuning_task",
+                        default = None,
+                        type = str,
+                        help = "Name of a task on which the XLNet TensorFlow model was fine-tuned")
+    args = parser.parse_args()
+    print(args)
+
+    convert_xlnet_checkpoint_to_pytorch(args.tf_checkpoint_path,
+                                        args.xlnet_config_file,
+                                        args.pytorch_dump_folder_path,
+                                        args.finetuning_task)
--- a/pytorch_pretrained_bert/file_utils.py
+++ b/pytorch_pretrained_bert/file_utils.py
@@ -29,7 +29,7 @@ except ImportError:
    torch_cache_home = os.path.expanduser(
        os.getenv('TORCH_HOME', os.path.join(
            os.getenv('XDG_CACHE_HOME', '~/.cache'), 'torch')))
-default_cache_path = os.path.join(torch_cache_home, 'pytorch_pretrained_bert')
+default_cache_path = os.path.join(torch_cache_home, 'pytorch_transformers')

 try:
    from urllib.parse import urlparse
@@ -44,9 +44,6 @@ except (AttributeError, ImportError):
    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
                                              default_cache_path)

-CONFIG_NAME = "config.json"
-WEIGHTS_NAME = "pytorch_model.bin"
-
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name


@@ -194,6 +191,8 @@ def get_from_cache(url, cache_dir=None):
        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
    if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
        cache_dir = str(cache_dir)
+    if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
+        cache_dir = str(cache_dir)

    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)
@@ -259,21 +258,3 @@ def get_from_cache(url, cache_dir=None):
            logger.info("removing temp file %s", temp_file.name)

    return cache_path
-
-
-def read_set_from_file(filename):
-    '''
-    Extract a de-duped collection (set) of text from a file.
-    Expected file format is one item per line.
-    '''
-    collection = set()
-    with open(filename, 'r', encoding='utf-8') as file_:
-        for line in file_:
-            collection.add(line.rstrip())
-    return collection
-
-
-def get_file_extension(path, dot=True, lower=True):
-    ext = os.path.splitext(path)[1]
-    ext = ext if dot else ext[1:]
-    return ext.lower() if lower else ext
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -0,0 +1,734 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch OpenAI GPT-2 model."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import collections
+import json
+import logging
+import math
+import os
+import sys
+from io import open
+
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss
+from torch.nn.parameter import Parameter
+
+from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
+                             PreTrainedModel, prune_conv1d_layer, SequenceSummary,
+                             add_start_docstrings)
+from .modeling_bert import BertLayerNorm as LayerNorm
+
+logger = logging.getLogger(__name__)
+
+# Download URLs of the pretrained GPT-2 weight files, keyed by model shortcut name.
+GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
+                                     "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"}
+# Download URLs of the matching configuration json files, keyed by the same shortcut names.
+GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
+                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"}
+
+def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
+    """ Load the variables of a TensorFlow GPT-2 checkpoint into a PyTorch model.
+
+    Each TF variable name is split on '/' and walked attribute by attribute from `model`
+    until the matching PyTorch parameter is reached; its data is then replaced in place.
+
+    Args:
+        model: PyTorch module whose attribute names mirror the TF variable scopes.
+        config: not read by this function.
+        gpt2_checkpoint_path: path of the TensorFlow checkpoint to read.
+
+    Returns:
+        The same `model` object, with its parameters overwritten.
+
+    Raises:
+        ImportError: if `re`, `numpy` or `tensorflow` cannot be imported.
+        AssertionError: if a TF array and the targeted PyTorch tensor differ in shape
+            (both shapes are appended to the exception args).
+    """
+    try:
+        import re
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error("Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
+    tf_path = os.path.abspath(gpt2_checkpoint_path)
+    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        # Drop singleton dimensions so the array shape can be compared with the PyTorch tensor
+        arrays.append(array.squeeze())
+
+    for name, array in zip(names, arrays):
+        name = name[6:]  # skip "model/"
+        name = name.split('/')
+        pointer = model
+        for m_name in name:
+            # Scopes such as "h0" are split into ['h', '0', ''] (attribute name + index).
+            # `re.match` with a trailing \Z is used instead of `re.fullmatch`, which is
+            # not available on Python 2.
+            if re.match(r'[A-Za-z]+\d+\Z', m_name):
+                l = re.split(r'(\d+)', m_name)
+            else:
+                l = [m_name]
+            if l[0] == 'w' or l[0] == 'g':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'b':
+                pointer = getattr(pointer, 'bias')
+            elif l[0] == 'wpe' or l[0] == 'wte':
+                # Embedding modules: descend to the module, then to its weight matrix
+                pointer = getattr(pointer, l[0])
+                pointer = getattr(pointer, 'weight')
+            else:
+                pointer = getattr(pointer, l[0])
+            if len(l) >= 2:
+                # Numbered scope: index into the container reached above
+                num = int(l[1])
+                pointer = pointer[num]
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            # Attach both shapes to the error to make the mismatch easy to diagnose
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+def gelu(x):
+    """ GELU activation computed element-wise on tensor `x` with the tanh-based formula. """
+    cubic_term = x + 0.044715 * torch.pow(x, 3)
+    squashed = torch.tanh(math.sqrt(2 / math.pi) * cubic_term)
+    return 0.5 * x * (1 + squashed)
+
+
+class GPT2Config(PretrainedConfig):
+    """Configuration class to store the configuration of a `GPT2Model`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or
+            the path to a configuration json file.
+        n_positions: Number of positional embeddings.
+        n_ctx: Size of the causal mask (usually same as n_positions).
+        n_embd: Dimensionality of the embeddings and hidden states.
+        n_layer: Number of hidden layers (transformer blocks).
+        n_head: Number of attention heads for each attention layer.
+        layer_norm_epsilon: Epsilon to use in the layer norm layers.
+        resid_pdrop: The dropout probability applied to the output of the attention
+            projection and of the MLP in each block.
+        attn_pdrop: The dropout ratio for the attention probabilities.
+        embd_pdrop: The dropout ratio for the embeddings.
+        initializer_range: The standard deviation of the normal distribution used to
+            initialize the weight matrices.
+        num_labels, summary_type, summary_use_proj, summary_activation,
+        summary_proj_to_labels, summary_first_dropout: stored as-is on the config;
+            presumably consumed by `SequenceSummary` in `modeling_utils` -- see that
+            class for their meaning.
+    """
+    pretrained_config_archive_map = GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(
+        self,
+        vocab_size_or_config_json_file=50257,
+        n_positions=1024,
+        n_ctx=1024,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+
+        num_labels=1,
+        summary_type='token_ids',
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        **kwargs
+    ):
+        """Constructs GPT2Config.
+
+        When the first argument is a string it is treated as the path of a json file and
+        every key of that file becomes an attribute (the other keyword values are then
+        ignored). When it is an int, the keyword values are stored as attributes.
+        See the class docstring for the meaning of each argument.
+
+        Raises:
+            ValueError: if the first argument is neither a string nor an int.
+        """
+        super(GPT2Config, self).__init__(**kwargs)
+
+        # `unicode` only exists on Python 2; the version test short-circuits on Python 3
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.n_ctx = n_ctx
+            self.n_positions = n_positions
+            self.n_embd = n_embd
+            self.n_layer = n_layer
+            self.n_head = n_head
+            self.resid_pdrop = resid_pdrop
+            self.embd_pdrop = embd_pdrop
+            self.attn_pdrop = attn_pdrop
+            self.layer_norm_epsilon = layer_norm_epsilon
+            self.initializer_range = initializer_range
+
+            self.num_labels = num_labels
+            self.summary_type = summary_type
+            self.summary_use_proj = summary_use_proj
+            self.summary_activation = summary_activation
+            self.summary_first_dropout = summary_first_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
+        else:
+            # Note the trailing space: the two literals are concatenated into one message
+            raise ValueError(
+                "First argument must be either a vocabulary size (int) "
+                "or the path to a pretrained model config file (str)"
+            )
+
+    # Read-only aliases exposing the GPT-2 hyper-parameters under generic names.
+    @property
+    def max_position_embeddings(self):
+        return self.n_positions
+
+    @property
+    def hidden_size(self):
+        return self.n_embd
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
+
+
+
+class Attention(nn.Module):
+    """ Causal multi-head self-attention.
+
+    A single `c_attn` layer produces query, key and value at once; `c_proj` is applied
+    to the merged heads. A lower-triangular buffer (`bias`) masks future positions.
+    """
+    def __init__(self, nx, n_ctx, config, scale=False):
+        """
+        Args:
+            nx: feature size of the input (must be divisible by `config.n_head`).
+            n_ctx: side length of the square causal mask.
+            config: object providing `n_head`, `attn_pdrop`, `resid_pdrop` and `output_attentions`.
+            scale: when True, attention scores are divided by sqrt(head feature size).
+        """
+        super(Attention, self).__init__()
+        self.output_attentions = config.output_attentions
+
+        n_state = nx  # in Attention: n_state=768 (nx=n_embd)
+        # [switch nx => n_state from Block to Attention to keep identical to TF implem]
+        assert n_state % config.n_head == 0
+        # Lower-triangular matrix of ones, reshaped to (1, 1, n_ctx, n_ctx): the causal mask
+        self.register_buffer("bias", torch.tril(torch.ones(n_ctx, n_ctx)).view(1, 1, n_ctx, n_ctx))
+        self.n_head = config.n_head
+        self.split_size = n_state
+        self.scale = scale
+
+        self.c_attn = Conv1D(n_state * 3, nx)
+        self.c_proj = Conv1D(n_state, nx)
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+
+    def prune_heads(self, heads):
+        """ Remove the attention heads whose indices are listed in `heads` (no-op when empty). """
+        if len(heads) == 0:
+            return
+        # One row per head, one column per feature of that head; zero the rows to drop
+        mask = torch.ones(self.n_head, self.split_size // self.n_head)
+        for head in heads:
+            mask[head] = 0
+        mask = mask.view(-1).contiguous().eq(1)
+        # Flat indices of the features that are kept
+        index = torch.arange(len(mask))[mask].long()
+        # Same indices repeated at offsets 0, split_size and 2*split_size (query, key, value thirds)
+        index_attn = torch.cat([index, index + self.split_size, index + (2*self.split_size)])
+        # Prune conv1d layers
+        # NOTE(review): semantics of `dim` are defined by prune_conv1d_layer in modeling_utils -- confirm there
+        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
+        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
+        # Update hyper params
+        self.split_size = (self.split_size // self.n_head) * (self.n_head - len(heads))
+        self.n_head = self.n_head - len(heads)
+
+    def _attn(self, q, k, v, head_mask=None):
+        """ Masked attention over already-split heads.
+
+        `k` is expected in the transposed layout produced by `split_heads(..., k=True)`.
+        Returns a list: [weighted values] plus the attention weights when
+        `self.output_attentions` is set.
+        """
+        w = torch.matmul(q, k)
+        if self.scale:
+            w = w / math.sqrt(v.size(-1))
+        # nd: number of query positions, ns: number of key positions
+        nd, ns = w.size(-2), w.size(-1)
+        # Take the last nd rows (of the first ns) of the causal mask so queries line up
+        # with the end of the key sequence when cached keys are prepended
+        b = self.bias[:, :, ns-nd:ns, :ns]
+        # Keep scores where b == 1, replace them by -1e4 where b == 0
+        w = w * b - 1e4 * (1 - b)
+
+        w = nn.Softmax(dim=-1)(w)
+        w = self.attn_dropout(w)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            w = w * head_mask
+
+        outputs = [torch.matmul(w, v)]
+        if self.output_attentions:
+            outputs.append(w)
+        return outputs
+
+    def merge_heads(self, x):
+        """ Inverse of `split_heads`: swap dims 1 and 2, then fuse the last two dimensions. """
+        x = x.permute(0, 2, 1, 3).contiguous()
+        new_x_shape = x.size()[:-2] + (x.size(-2) * x.size(-1),)
+        return x.view(*new_x_shape)  # in Tensorflow implem: fct merge_states
+
+    def split_heads(self, x, k=False):
+        """ Split the last dimension into (n_head, features per head) and move heads to dim 1.
+
+        With `k=True` the last two dimensions are additionally swapped so the result
+        can be used directly as the right operand of the score matmul.
+        """
+        new_x_shape = x.size()[:-1] + (self.n_head, x.size(-1) // self.n_head)
+        x = x.view(*new_x_shape)  # in Tensorflow implem: fct split_states
+        if k:
+            return x.permute(0, 2, 3, 1)  # (batch, head, head_features, seq_length)
+        else:
+            return x.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
+
+    def forward(self, x, layer_past=None, head_mask=None):
+        """ Run self-attention on `x`.
+
+        Args:
+            x: input tensor; `c_attn` output is split along dim 2 into query/key/value.
+            layer_past: optional `present` returned by a previous call (stacked key and
+                value), prepended to the current key and value.
+            head_mask: optional multiplier applied to the attention weights.
+
+        Returns:
+            list [a, present] plus the attention weights when `output_attentions` is set.
+        """
+        x = self.c_attn(x)
+        query, key, value = x.split(self.split_size, dim=2)
+        query = self.split_heads(query)
+        key = self.split_heads(key, k=True)
+        value = self.split_heads(value)
+        if layer_past is not None:
+            past_key, past_value = layer_past[0].transpose(-2, -1), layer_past[1]  # transpose back cf below
+            # Keys are stored transposed, so the sequence axis is the last one for keys
+            # and the second-to-last one for values
+            key = torch.cat((past_key, key), dim=-1)
+            value = torch.cat((past_value, value), dim=-2)
+        present = torch.stack((key.transpose(-2, -1), value))  # transpose to have same shapes for stacking
+
+        attn_outputs = self._attn(query, key, value, head_mask)
+        a = attn_outputs[0]
+
+        a = self.merge_heads(a)
+        a = self.c_proj(a)
+        a = self.resid_dropout(a)
+
+        outputs = [a, present] + attn_outputs[1:]
+        return outputs  # a, present, (attentions)
+
+
+class MLP(nn.Module):
+    """ Feed-forward part of a block: two Conv1D layers with a GELU in between, then dropout. """
+    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
+        super(MLP, self).__init__()
+        embed_dim = config.n_embd
+        self.c_fc = Conv1D(n_state, embed_dim)
+        self.c_proj = Conv1D(embed_dim, n_state)
+        self.act = gelu
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, x):
+        """ Apply c_fc, the activation, c_proj and dropout, in that order. """
+        activated = self.act(self.c_fc(x))
+        projected = self.c_proj(activated)
+        return self.dropout(projected)
+
+
+class Block(nn.Module):
+    """ One transformer layer: layer-normed attention then layer-normed MLP, each added to its input. """
+    def __init__(self, n_ctx, config, scale=False):
+        super(Block, self).__init__()
+        embed_dim = config.n_embd
+        self.ln_1 = LayerNorm(embed_dim, eps=config.layer_norm_epsilon)
+        self.attn = Attention(embed_dim, n_ctx, config, scale)
+        self.ln_2 = LayerNorm(embed_dim, eps=config.layer_norm_epsilon)
+        self.mlp = MLP(4 * embed_dim, config)
+
+    def forward(self, x, layer_past=None, head_mask=None):
+        """ Returns the list [x, present] plus the attention weights when the attention layer emits them. """
+        # attn_outputs: attention output, present, (attentions)
+        attn_outputs = self.attn(self.ln_1(x), layer_past=layer_past, head_mask=head_mask)
+
+        # First residual connection, around the attention layer
+        x = x + attn_outputs[0]
+        # Second residual connection, around the MLP
+        x = x + self.mlp(self.ln_2(x))
+
+        return [x] + attn_outputs[1:]  # x, present, (attentions)
+
+
+class GPT2PreTrainedModel(PreTrainedModel):
+    """ Abstract base class that handles weights initialization and offers
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = GPT2Config
+    pretrained_model_archive_map = GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_gpt2
+    base_model_prefix = "transformer"
+
+    def __init__(self, *inputs, **kwargs):
+        super(GPT2PreTrainedModel, self).__init__(*inputs, **kwargs)
+
+    def init_weights(self, module):
+        """ Initialize the parameters of a single `module` (meant to be used with `nn.Module.apply`).
+        """
+        if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
+            # The TF version draws from truncated_normal; a plain normal is used here instead
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            # Embeddings carry no bias; only linear-like layers may have one to reset
+            linear_like = isinstance(module, (nn.Linear, Conv1D))
+            if linear_like and module.bias is not None:
+                module.bias.data.zero_()
+            return
+        if isinstance(module, LayerNorm):
+            # Layer norm starts as the identity transform: zero shift, unit scale
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+# Shared introduction prepended to the docstring of every GPT-2 model class.
+GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
+    `Language Models are Unsupervised Multitask Learners`_
+    by Alec Radford*, Jeffrey Wu*, Rewon Child, David Luan, Dario Amodei** and Ilya Sutskever**.
+    It's a causal (unidirectional) transformer pre-trained using language modeling on a very large
+    corpus of ~40 GB of text data.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matters related to general usage and behavior.
+
+    .. _`Language Models are Unsupervised Multitask Learners`:
+        https://openai.com/blog/better-language-models/
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+"""
+
+# Shared description of the `forward` inputs. Only arguments that the decorated
+# `forward` methods actually accept are listed here.
+GPT2_INPUTS_DOCSTRING = r"""    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **past**: (`optional`)
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `past` output below). Can be used to speed up sequential decoding.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare GPT2 Model transformer outputing raw hidden-states without any specific head on top.",
+                      GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
+class GPT2Model(GPT2PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **past**:
+            tuple of ``torch.FloatTensor`` (one for each layer) of shape
+            ``(2, batch_size, num_heads, total_sequence_length, hidden_size // num_heads)``:
+            the stacked keys and values of the attention blocks, where ``total_sequence_length``
+            counts the positions of the `past` input plus those of the current input.
+            Can be used (see `past` input) to speed up sequential decoding.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            tuple of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            tuple of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        >>> config = GPT2Config.from_pretrained('gpt2')
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = GPT2Model(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    def __init__(self, config):
+        super(GPT2Model, self).__init__(config)
+        self.output_hidden_states = config.output_hidden_states
+        self.output_attentions = config.output_attentions
+
+        # wte: token embeddings, wpe: position embeddings
+        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
+        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
+        self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+
+        self.apply(self.init_weights)
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        """ Replace `wte` by the result of `_get_resized_embeddings` and return it. """
+        self.wte = self._get_resized_embeddings(self.wte, new_num_tokens)
+        return self.wte
+
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        for layer, heads in heads_to_prune.items():
+            self.h[layer].attn.prune_heads(heads)
+
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, past=None, head_mask=None):
+        """ Embed the inputs, run every block and apply the final layer norm.
+
+        Returns a tuple: last hidden state, presents, (all hidden states), (attentions).
+        """
+        if past is None:
+            past_length = 0
+            past = [None] * len(self.h)
+        else:
+            # past[0][0] is the cached key tensor of the first layer; its second-to-last
+            # dimension is the number of positions already processed
+            past_length = past[0][0].size(-2)
+        if position_ids is None:
+            # Default positions continue right after the cached positions
+            position_ids = torch.arange(past_length, input_ids.size(-1) + past_length, dtype=torch.long, device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                # (n_heads,) -> (1, 1, n_heads, 1, 1), then repeated for every layer
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.n_layer
+
+        # Flatten every leading dimension into one batch dimension; the original
+        # shape is restored on the outputs below
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, input_ids.size(-1))
+        position_ids = position_ids.view(-1, position_ids.size(-1))
+
+        inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
+            # Token types are looked up in the token embedding matrix itself
+            token_type_embeds = self.wte(token_type_ids)
+        else:
+            token_type_embeds = 0
+        hidden_states = inputs_embeds + position_embeds + token_type_embeds
+        hidden_states = self.drop(hidden_states)
+
+        output_shape = input_shape + (hidden_states.size(-1),)
+
+        presents = ()
+        all_attentions = []
+        all_hidden_states = ()
+        for i, (block, layer_past) in enumerate(zip(self.h, past)):
+            if self.output_hidden_states:
+                # Records the input of each block (the first entry is the embedding output)
+                all_hidden_states = all_hidden_states + (hidden_states.view(*output_shape),)
+
+            outputs = block(hidden_states, layer_past, head_mask[i])
+            hidden_states, present = outputs[:2]
+            presents = presents + (present,)
+
+            if self.output_attentions:
+                all_attentions.append(outputs[2])
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(*output_shape)
+        # Add last hidden state
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states, presents)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            # let the number of heads free (-1) so we can extract attention even after head pruning
+            attention_output_shape = input_shape[:-1] + (-1,) + all_attentions[0].shape[-2:]
+            all_attentions = tuple(t.view(*attention_output_shape) for t in all_attentions)
+            outputs = outputs + (all_attentions,)
+        return outputs  # last hidden state, presents, (all hidden_states), (attentions)
+
+
+@add_start_docstrings("""The GPT2 Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """, GPT2_START_DOCSTRING, GPT2_INPUTS_DOCSTRING)
+class GPT2LMHeadModel(GPT2PreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
+            Indices are selected in ``[-1, 0, ..., config.vocab_size - 1]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size - 1]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **past**:
+            tuple of ``torch.FloatTensor`` (one for each layer) holding the stacked keys and values
+            of the attention blocks.
+            Can be used (see `past` input) to speed up sequential decoding.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            tuple of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            tuple of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        >>> config = GPT2Config.from_pretrained('gpt2')
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = GPT2LMHeadModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=input_ids)
+        >>> loss, logits = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(GPT2LMHeadModel, self).__init__(config)
+        self.transformer = GPT2Model(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Share the input embedding matrix with the output projection.
+            TorchScript export cannot handle shared parameters, in which case they are cloned instead.
+        """
+        self._tie_or_clone_weights(self.lm_head, self.transformer.wte)
+
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, past=None, head_mask=None):
+        """ Returns (loss), lm_logits, presents, (all hidden_states), (attentions). """
+        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                                               past=past, head_mask=head_mask)
+
+        lm_logits = self.lm_head(transformer_outputs[0])
+        outputs = (lm_logits,) + transformer_outputs[1:]
+
+        if labels is None:
+            return outputs  # lm_logits, presents, (all hidden_states), (attentions)
+
+        # Position t predicts token t + 1: drop the last logit and the first label
+        predictions = lm_logits[..., :-1, :].contiguous()
+        targets = labels[..., 1:].contiguous()
+        # Labels equal to -1 are ignored by the loss
+        criterion = CrossEntropyLoss(ignore_index=-1)
+        loss = criterion(predictions.view(-1, predictions.size(-1)),
+                         targets.view(-1))
+
+        return (loss,) + outputs  # loss, lm_logits, presents, (all hidden_states), (attentions)
+
+
+@add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
+head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
+The language modeling head has its weights tied to the input embeddings,
+the classification head takes as input the input of a specified classification token index in the input sequence).
+""", GPT2_START_DOCSTRING)
+class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
+    r"""    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+            Indices can be obtained using :class:`pytorch_transformers.GPT2Tokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
+            Index of the classification token in each input sequence.
+            Selected in the range ``[0, input_ids.size(-1) - 1]``.
+        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+            Indices are selected in ``[-1, 0, ..., config.vocab_size - 1]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size - 1]``
+        **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
+            Labels for computing the multiple choice classification loss.
+            Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above)
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **past**:
+            list of ``torch.FloatTensor`` (one for each layer):
+            that contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model
+            (see `past` output below). Can be used to speed up sequential decoding.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **mc_loss**: (`optional`, returned when ``mc_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Multiple choice classification loss.
+        **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
+            Prediction scores of the multiplechoice classification head (scores for each choice before SoftMax).
+        **past**:
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            that contains pre-computed hidden-states (key and values in the attention blocks).
+            Can be used (see `past` input) to speed up sequential decoding.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+
+    Examples::
+
+        >>> config = GPT2Config.from_pretrained('gpt2')
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = GPT2DoubleHeadsModel(config)
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        >>> last_token = input_ids.size(-1) - 1  # classify from the last token of each choice
+        >>> mc_token_ids = torch.tensor([last_token, last_token]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, mc_token_ids)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(GPT2DoubleHeadsModel, self).__init__(config)
+        self.transformer = GPT2Model(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        self.multiple_choice_head = SequenceSummary(config)
+
+        self.apply(self.init_weights)
+        # Tie after initialization so a freshly constructed model really shares
+        # the LM head with the input embeddings, as the class documentation states.
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.wte)
+
+    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
+                position_ids=None, past=None, head_mask=None):
+        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                                               past=past, head_mask=head_mask)
+        hidden_states = transformer_outputs[0]
+
+        lm_logits = self.lm_head(hidden_states)
+        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
+
+        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
+        # Losses are prepended: the multiple-choice loss first, then the LM loss,
+        # so the LM loss ends up in front when both label sets are given.
+        if mc_labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
+                            mc_labels.view(-1))
+            outputs = (loss,) + outputs
+        if lm_labels is not None:
+            # Shift so that the logits at position i are scored against the token at position i + 1
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = lm_labels[..., 1:].contiguous()
+            # Labels equal to -1 are excluded from the loss
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (lm loss), (mc loss), lm logits, mc logits, presents, (all hidden_states), (attentions)
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -0,0 +1,718 @@
+# coding=utf-8
+# Copyright 2018 The OpenAI Team Authors and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch OpenAI GPT model."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import collections
+import json
+import logging
+import math
+import os
+import sys
+from io import open
+
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss
+from torch.nn.parameter import Parameter
+
+from .modeling_utils import (Conv1D, CONFIG_NAME, WEIGHTS_NAME, PretrainedConfig,
+                             PreTrainedModel, prune_conv1d_layer, SequenceSummary,
+                             add_start_docstrings)
+from .modeling_bert import BertLayerNorm as LayerNorm
+
+logger = logging.getLogger(__name__)
+
+OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-pytorch_model.bin"}
+OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP = {"openai-gpt": "https://s3.amazonaws.com/models.huggingface.co/bert/openai-gpt-config.json"}
+
+
+def load_tf_weights_in_openai_gpt(model, config, openai_checkpoint_folder_path):
+    """ Load tf pre-trained weights in a pytorch model (from NumPy arrays here)
+
+    Args:
+        model: module exposing ``tokens_embed`` and ``positions_embed`` plus the sub-modules
+            named in ``parameters_names.json``; its weights are overwritten in place.
+        config: unused here, kept so the signature stays unchanged for callers.
+        openai_checkpoint_folder_path: folder holding ``parameters_names.json``,
+            ``params_shapes.json`` and ``params_0.npy`` ... ``params_9.npy``. If the path
+            contains ``.ckpt``, its parent directory is used instead.
+
+    Returns:
+        The same ``model`` object, with weights loaded.
+
+    Raises:
+        AssertionError: if a checkpoint array and its target parameter differ in shape
+            (both shapes are appended to the exception arguments).
+    """
+    import re
+    import numpy as np
+
+    if '.ckpt' in openai_checkpoint_folder_path:
+        openai_checkpoint_folder_path = os.path.dirname(openai_checkpoint_folder_path)
+
+    logger.info("Loading weights from {}".format(openai_checkpoint_folder_path))
+
+    # os.path.join keeps a bare "model.ckpt" (dirname == '') relative to the current directory
+    # instead of resolving to the filesystem root; ``with`` closes the handles deterministically.
+    names_path = os.path.join(openai_checkpoint_folder_path, 'parameters_names.json')
+    shapes_path = os.path.join(openai_checkpoint_folder_path, 'params_shapes.json')
+    with open(names_path, "r", encoding='utf-8') as names_file:
+        names = json.load(names_file)
+    with open(shapes_path, "r", encoding='utf-8') as shapes_file:
+        shapes = json.load(shapes_file)
+
+    # The checkpoint is stored as 10 flat shards; concatenate them and cut at the cumulative sizes.
+    offsets = np.cumsum([np.prod(shape) for shape in shapes])
+    init_params = [np.load(os.path.join(openai_checkpoint_folder_path, 'params_{}.npy'.format(n)))
+                   for n in range(10)]
+    init_params = np.split(np.concatenate(init_params, 0), offsets)[:-1]
+    init_params = [param.reshape(shape) for param, shape in zip(init_params, shapes)]
+
+    # This was used when we had a single embedding matrix for positions and tokens
+    # init_params[0] = np.concatenate([init_params[1], init_params[0]], 0)
+    # del init_params[1]
+    init_params = [arr.squeeze() for arr in init_params]
+
+    try:
+        assert model.tokens_embed.weight.shape == init_params[1].shape
+        assert model.positions_embed.weight.shape == init_params[0].shape
+    except AssertionError as e:
+        e.args += (model.tokens_embed.weight.shape, init_params[1].shape)
+        e.args += (model.positions_embed.weight.shape, init_params[0].shape)
+        raise
+
+    model.tokens_embed.weight.data = torch.from_numpy(init_params[1])
+    model.positions_embed.weight.data = torch.from_numpy(init_params[0])
+    names.pop(0)
+    # Pop position and token embedding arrays
+    init_params.pop(0)
+    init_params.pop(0)
+
+    # "<letters><digits>" (e.g. "h3") addresses element <digits> of attribute <letters>.
+    # re.match with \Z is the Python 2 compatible spelling of re.fullmatch.
+    indexed_name = re.compile(r'[A-Za-z]+\d+\Z')
+
+    for name, array in zip(names, init_params):
+        name = name[6:]  # skip "model/"
+        assert name[-2:] == ":0"
+        name = name[:-2]
+        name = name.split('/')
+        pointer = model
+        for m_name in name:
+            if indexed_name.match(m_name):
+                l = re.split(r'(\d+)', m_name)
+            else:
+                l = [m_name]
+            # TF names 'g' and 'w' both map to ``weight``; 'b' maps to ``bias``
+            if l[0] == 'g' or l[0] == 'w':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'b':
+                pointer = getattr(pointer, 'bias')
+            else:
+                pointer = getattr(pointer, l[0])
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+def gelu(x):
+    """Tanh-based approximation of the GELU activation, applied element-wise to tensor ``x``."""
+    cubic = x + 0.044715 * torch.pow(x, 3)
+    gate = 1 + torch.tanh(math.sqrt(2 / math.pi) * cubic)
+    return 0.5 * x * gate
+
+
+def swish(x):
+    """Swish activation: the input scaled element-wise by its own sigmoid."""
+    gate = torch.sigmoid(x)
+    return x * gate
+
+
+# Maps ``config.afn`` names to callables that are applied directly to a tensor (see ``MLP.forward``).
+# The functional ReLU is used because ``nn.ReLU`` is a module class: calling it with a tensor
+# would construct a module instead of applying the activation.
+ACT_FNS = {"relu": nn.functional.relu, "swish": swish, "gelu": gelu}
+
+
+class OpenAIGPTConfig(PretrainedConfig):
+    """
+    Configuration class to store the configuration of a `OpenAIGPTModel`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
+            When a path (str) is given, every key of the JSON file is copied onto the instance
+            and the remaining keyword defaults below are not applied.
+        n_positions: Number of positional embeddings.
+        n_ctx: Size of the causal mask (usually same as n_positions).
+        n_embd: Dimensionality of the embeddings and hidden states.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        afn: The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+        resid_pdrop: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        attn_pdrop: The dropout ratio for the attention
+            probabilities.
+        embd_pdrop: The dropout ratio for the embeddings.
+        layer_norm_epsilon: epsilon to use in the layer norm layers
+        initializer_range: The sttdev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        predict_special_tokens: should we predict special tokens (when the model has a LM head)
+        num_labels, summary_type, summary_use_proj, summary_activation, summary_proj_to_labels,
+        summary_first_dropout: stored unchanged on the configuration; presumably read by the
+            `SequenceSummary` head from `modeling_utils` (confirm their exact meaning there).
+    """
+    pretrained_config_archive_map = OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(
+        self,
+        vocab_size_or_config_json_file=40478,
+        n_positions=512,
+        n_ctx=512,
+        n_embd=768,
+        n_layer=12,
+        n_head=12,
+        afn="gelu",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        predict_special_tokens=True,
+
+        num_labels=1,
+        summary_type='token_ids',
+        summary_use_proj=True,
+        summary_activation=None,
+        summary_proj_to_labels=True,
+        summary_first_dropout=0.1,
+        **kwargs
+    ):
+        """Constructs OpenAIGPTConfig.
+
+        Raises:
+            ValueError: if the first argument is neither an int nor a string path.
+        """
+        super(OpenAIGPTConfig, self).__init__(**kwargs)
+
+        # ``unicode`` only exists on Python 2; the version check short-circuits before it is evaluated on Python 3
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.vocab_size = vocab_size_or_config_json_file
+            self.n_ctx = n_ctx
+            self.n_positions = n_positions
+            self.n_embd = n_embd
+            self.n_layer = n_layer
+            self.n_head = n_head
+            self.afn = afn
+            self.resid_pdrop = resid_pdrop
+            self.embd_pdrop = embd_pdrop
+            self.attn_pdrop = attn_pdrop
+            self.layer_norm_epsilon = layer_norm_epsilon
+            self.initializer_range = initializer_range
+            self.predict_special_tokens = predict_special_tokens
+
+            self.num_labels = num_labels
+            self.summary_type = summary_type
+            self.summary_use_proj = summary_use_proj
+            self.summary_activation = summary_activation
+            self.summary_first_dropout = summary_first_dropout
+            self.summary_proj_to_labels = summary_proj_to_labels
+        else:
+            # Trailing space keeps the two adjacent literals from running together ("(int)or")
+            raise ValueError(
+                "First argument must be either a vocabulary size (int) "
+                "or the path to a pretrained model config file (str)"
+            )
+
+    # Read-only aliases exposing the GPT-specific field names under the generic names
+    @property
+    def max_position_embeddings(self):
+        return self.n_positions
+
+    @property
+    def hidden_size(self):
+        return self.n_embd
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
+
+
+class Attention(nn.Module):
+    """Multi-head self-attention with a lower-triangular (causal) mask, built on `Conv1D` projections."""
+
+    def __init__(self, nx, n_ctx, config, scale=False):
+        super(Attention, self).__init__()
+        # n_state equals nx (n_embd); the separate name is kept to stay close to the TF implementation
+        n_state = nx
+        assert n_state % config.n_head == 0
+        # Lower-triangular matrix of ones stored as a (1, 1, n_ctx, n_ctx) buffer named "bias"
+        causal_mask = torch.tril(torch.ones(n_ctx, n_ctx))
+        self.register_buffer("bias", causal_mask.view(1, 1, n_ctx, n_ctx))
+        self.n_head = config.n_head
+        self.split_size = n_state
+        self.scale = scale
+
+        self.output_attentions = config.output_attentions
+
+        self.c_attn = Conv1D(n_state * 3, nx)
+        self.c_proj = Conv1D(n_state, nx)
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+
+    def prune_heads(self, heads):
+        """Remove the listed head indices from the projections and update the head bookkeeping."""
+        if len(heads) == 0:
+            return
+        head_dim = self.split_size // self.n_head
+        keep = torch.ones(self.n_head, head_dim)
+        for head in heads:
+            keep[head] = 0
+        keep = keep.view(-1).contiguous().eq(1)
+        index = torch.arange(len(keep))[keep].long()
+        # c_attn holds query, key and value side by side, so the kept columns repeat at three offsets
+        index_attn = torch.cat([index + part * self.split_size for part in range(3)])
+        # Prune conv1d layers
+        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
+        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
+        # Update hyper params
+        remaining_heads = self.n_head - len(heads)
+        self.split_size = head_dim * remaining_heads
+        self.n_head = remaining_heads
+
+    def _attn(self, q, k, v, head_mask=None):
+        """Return ``[context]`` or ``[context, attention_probs]`` depending on ``output_attentions``."""
+        scores = torch.matmul(q, k)
+        if self.scale:
+            scores = scores / math.sqrt(v.size(-1))
+        # The stored mask may be larger than the score matrix, so crop it to fit
+        causal = self.bias[:, :, : scores.size(-2), : scores.size(-1)]
+        # Keep scores where the mask is 1 and replace the rest with -1e9 before the softmax
+        scores = scores * causal + -1e9 * (1 - causal)
+
+        probs = self.attn_dropout(nn.Softmax(dim=-1)(scores))
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            probs = probs * head_mask
+
+        context = torch.matmul(probs, v)
+        return [context, probs] if self.output_attentions else [context]
+
+    def merge_heads(self, x):
+        """Undo ``split_heads``: swap dims 1 and 2, then flatten the last two dims into one."""
+        merged = x.permute(0, 2, 1, 3).contiguous()
+        flat_shape = merged.size()[:-2] + (merged.size(-2) * merged.size(-1),)
+        return merged.view(*flat_shape)  # in Tensorflow implem: fct merge_states
+
+    def split_heads(self, x, k=False):
+        """Split the last dim into ``(n_head, features)`` and move the head dim to position 1."""
+        per_head = x.size(-1) // self.n_head
+        x = x.view(*(x.size()[:-1] + (self.n_head, per_head)))  # in Tensorflow implem: fct split_states
+        # Keys get their last two dims swapped so that matmul(q, k) needs no extra transpose
+        order = (0, 2, 3, 1) if k else (0, 2, 1, 3)
+        return x.permute(*order)
+
+    def forward(self, x, head_mask=None):
+        qkv = self.c_attn(x)
+        query, key, value = qkv.split(self.split_size, dim=2)
+        query = self.split_heads(query)
+        key = self.split_heads(key, k=True)
+        value = self.split_heads(value)
+
+        attn_outputs = self._attn(query, key, value, head_mask)
+
+        a = self.resid_dropout(self.c_proj(self.merge_heads(attn_outputs[0])))
+        return [a] + attn_outputs[1:]  # a, (attentions)
+
+
+class MLP(nn.Module):
+    """Feed-forward sub-layer: `Conv1D` up-projection, activation, `Conv1D` down-projection, dropout."""
+
+    def __init__(self, n_state, config):  # in MLP: n_state=3072 (4 * n_embd)
+        super(MLP, self).__init__()
+        embed_dim = config.n_embd
+        self.c_fc = Conv1D(n_state, embed_dim)
+        self.c_proj = Conv1D(embed_dim, n_state)
+        # Activation callable looked up by name from the module-level table
+        self.act = ACT_FNS[config.afn]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, x):
+        activated = self.act(self.c_fc(x))
+        return self.dropout(self.c_proj(activated))
+
+
+class Block(nn.Module):
+    """One transformer layer: attention and MLP, each followed by a residual add and layer norm."""
+
+    def __init__(self, n_ctx, config, scale=False):
+        super(Block, self).__init__()
+        embed_dim = config.n_embd
+        eps = config.layer_norm_epsilon
+        self.attn = Attention(embed_dim, n_ctx, config, scale)
+        self.ln_1 = LayerNorm(embed_dim, eps=eps)
+        self.mlp = MLP(4 * embed_dim, config)
+        self.ln_2 = LayerNorm(embed_dim, eps=eps)
+
+    def forward(self, x, head_mask=None):
+        attn_outputs = self.attn(x, head_mask=head_mask)
+
+        # Layer norm is applied after each residual sum
+        normed_attn = self.ln_1(x + attn_outputs[0])
+        h = self.ln_2(normed_attn + self.mlp(normed_attn))
+
+        # h, followed by the attention weights when the attention layer returned them
+        return [h] + attn_outputs[1:]
+
+
+class OpenAIGPTPreTrainedModel(PreTrainedModel):
+    """ Abstract base class that wires the OpenAI GPT config class, archive map and
+        TF weight loader into `PreTrainedModel`, and defines the weight initialization.
+    """
+    config_class = OpenAIGPTConfig
+    pretrained_model_archive_map = OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_openai_gpt
+    base_model_prefix = "transformer"
+
+    def __init__(self, *inputs, **kwargs):
+        super(OpenAIGPTPreTrainedModel, self).__init__(*inputs, **kwargs)
+
+    def init_weights(self, module):
+        """ Initialize the weights of a single sub-module (meant to be passed to ``self.apply``).
+        """
+        dense_types = (nn.Linear, Conv1D)
+        if isinstance(module, dense_types + (nn.Embedding,)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            has_bias = isinstance(module, dense_types) and module.bias is not None
+            if has_bias:
+                module.bias.data.zero_()
+            return
+        if isinstance(module, LayerNorm):
+            # Layer norm starts with zero bias and unit weight
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+# Shared introduction prepended to the model class docstrings through ``add_start_docstrings``.
+OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
+    `Improving Language Understanding by Generative Pre-Training`_
+    by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.
+    It's a causal (unidirectional) transformer pre-trained using language modeling on a large
+    corpus with long range dependencies, the Toronto Book Corpus.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matters related to general usage and behavior.
+
+    .. _`Improving Language Understanding by Generative Pre-Training`:
+        https://openai.com/blog/language-unsupervised/
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
+"""
+
+# Shared "Inputs" section prepended to the model class docstrings through ``add_start_docstrings``.
+# Only arguments accepted by the ``forward`` methods of the OpenAI GPT models are listed.
+OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using :class:`pytorch_transformers.OpenAIGPTTokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare OpenAI GPT transformer model outputing raw hidden-states without any specific head on top.",
+                      OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
+class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+
+    Examples::
+
+        >>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        >>> model = OpenAIGPTModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    def __init__(self, config):
+        super(OpenAIGPTModel, self).__init__(config)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+
+        self.tokens_embed = nn.Embedding(config.vocab_size, config.n_embd)
+        self.positions_embed = nn.Embedding(config.n_positions, config.n_embd)
+        self.drop = nn.Dropout(config.embd_pdrop)
+        layers = [Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)]
+        self.h = nn.ModuleList(layers)
+
+        self.apply(self.init_weights)
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        """Swap in the resized token embedding module and return it."""
+        resized = self._get_resized_embeddings(self.tokens_embed, new_num_tokens)
+        self.tokens_embed = resized
+        return self.tokens_embed
+
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        for layer_num in heads_to_prune:
+            self.h[layer_num].attn.prune_heads(heads_to_prune[layer_num])
+
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, head_mask=None):
+        seq_length = input_ids.size(-1)
+        if position_ids is None:
+            # This was used when we had a single embedding matrix for position and token embeddings
+            # start = self.config.vocab_size + self.config.n_special
+            # end = start + input_ids.size(-1)
+            # position_ids = torch.arange(start, end, dtype=torch.long, device=input_ids.device)
+            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        if head_mask is None:
+            head_mask = [None] * self.config.n_layer
+        else:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.n_layer, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                # We can specify head_mask for each layer
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)
+            # switch to float if needed + fp16 compatibility
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)
+
+        # Inputs may carry extra leading dims; flatten them to 2D here and restore them on the outputs
+        input_shape = input_ids.size()
+        input_ids = input_ids.view(-1, seq_length)
+        position_ids = position_ids.view(-1, position_ids.size(-1))
+
+        inputs_embeds = self.tokens_embed(input_ids)
+        position_embeds = self.positions_embed(position_ids)
+        if token_type_ids is None:
+            token_type_embeds = 0
+        else:
+            # Token types are looked up in the token embedding table
+            token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1))
+            token_type_embeds = self.tokens_embed(token_type_ids)
+        hidden_states = self.drop(inputs_embeds + position_embeds + token_type_embeds)
+
+        output_shape = input_shape + (hidden_states.size(-1),)
+
+        hidden_trace = []
+        attention_trace = []
+        for layer_idx, layer in enumerate(self.h):
+            if self.output_hidden_states:
+                # Recorded before the layer runs, so the first entry is the embedding output
+                hidden_trace.append(hidden_states.view(*output_shape))
+
+            layer_outputs = layer(hidden_states, head_mask[layer_idx])
+            hidden_states = layer_outputs[0]
+            if self.output_attentions:
+                attention_trace.append(layer_outputs[1])
+
+        # Add last layer
+        if self.output_hidden_states:
+            hidden_trace.append(hidden_states.view(*output_shape))
+
+        outputs = (hidden_states.view(*output_shape),)
+        if self.output_hidden_states:
+            outputs = outputs + (tuple(hidden_trace),)
+        if self.output_attentions:
+            outputs = outputs + (tuple(attention_trace),)
+        return outputs  # last hidden state, (all hidden states), (all attentions)
+
+
+@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling head on top
+(linear layer with weights tied to the input embeddings). """, OPENAI_GPT_START_DOCSTRING, OPENAI_GPT_INPUTS_DOCSTRING)
+class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
+            Indices are selected in ``[-1, 0, ..., config.vocab_size - 1]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size - 1]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        >>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        >>> model = OpenAIGPTLMHeadModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=input_ids)
+        >>> loss, logits = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(OpenAIGPTLMHeadModel, self).__init__(config)
+        self.transformer = OpenAIGPTModel(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.tokens_embed)
+
+    def forward(self, input_ids, position_ids=None, token_type_ids=None, labels=None, head_mask=None):
+        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                                               head_mask=head_mask)
+        hidden_states = transformer_outputs[0]
+        lm_logits = self.lm_head(hidden_states)
+
+        outputs = (lm_logits,) + transformer_outputs[1:]  # keep any extra outputs returned by the base model
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss(ignore_index=-1)  # positions labelled -1 do not contribute to the loss
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), lm_logits, (all hidden states), (all attentions)
+
+
+@add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
+head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
+The language modeling head has its weights tied to the input embeddings,
+the classification head takes as input the input of a specified classification token index in the intput sequence).
+""", OPENAI_GPT_START_DOCSTRING)
+class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
+    r"""    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            The second dimension of the input (`num_choices`) indicates the number of choices to score.
+            Indices can be obtained using :class:`pytorch_transformers.OpenAIGPTTokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **mc_token_ids**: ``torch.LongTensor`` of shape ``(batch_size, num_choices)``:
+            Index of the classification token in each input sequence.
+            Selected in the range ``[0, input_ids.size(-1) - 1[``.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, num_choices, sequence_length)``:
+            Mask to avoid performing attention on padding token indices (NOTE: ``forward`` currently has no such parameter).
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+            Indices are selected in ``[-1, 0, ..., config.vocab_size - 1]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
+            computed for labels in ``[0, ..., config.vocab_size - 1]``
+        **mc_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size)``:
+            Labels for computing the multiple choice classification loss.
+            Indices should be in ``[0, ..., num_choices - 1]`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above)
+
+            `mc_labels`: optional multiple choice labels: ``torch.LongTensor`` of shape [batch_size]
+                with indices selected in [0, ..., num_choices - 1].
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **mc_loss**: (`optional`, returned when ``mc_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Multiple choice classification loss.
+        **lm_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **mc_prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)``
+            Prediction scores of the multiple choice classification head (scores for each choice before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        >>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        >>> model = OpenAIGPTDoubleHeadsModel(config)
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        >>> mc_token_ids = torch.tensor([input_ids.size(-1) - 1, input_ids.size(-1) - 1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, mc_token_ids)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
+
+        self.transformer = OpenAIGPTModel(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+        self.multiple_choice_head = SequenceSummary(config)
+
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head,
+                                   self.transformer.tokens_embed)
+
+    def forward(self, input_ids, mc_token_ids=None, lm_labels=None, mc_labels=None, token_type_ids=None,
+                position_ids=None, head_mask=None):
+        transformer_outputs = self.transformer(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                                               head_mask=head_mask)
+        hidden_states = transformer_outputs[0]
+
+        lm_logits = self.lm_head(hidden_states)
+        mc_logits = self.multiple_choice_head(hidden_states, mc_token_ids).squeeze(-1)
+
+        outputs = (lm_logits, mc_logits) + transformer_outputs[1:]
+        if mc_labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(mc_logits.view(-1, mc_logits.size(-1)),
+                            mc_labels.view(-1))
+            outputs = (loss,) + outputs  # multiple choice loss
+        if lm_labels is not None:
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = lm_labels[..., 1:].contiguous()
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
+            outputs = (loss,) + outputs  # prepended last, so the LM loss ends up before the MC loss
+
+        return outputs  # (lm loss), (mc loss), lm logits, mc logits, (all hidden_states), (attentions)
--- a/pytorch_pretrained_bert/modeling_transfo_xl.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl.py
--- a/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
+++ b/pytorch_pretrained_bert/modeling_transfo_xl_utilities.py
@@ -89,13 +89,13 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):

        return logit

-    def forward(self, hidden, target=None, keep_order=False):
+    def forward(self, hidden, labels=None, keep_order=False):
        '''
            Params:
                hidden :: [len*bsz x d_proj]
-                target :: [len*bsz]
+                labels :: [len*bsz]
            Return:
-                if target is None:
+                if labels is not None:
                    out :: [len*bsz] Negative log likelihood
                else:
                    out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
@@ -104,18 +104,18 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
            here: https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
        '''

-        if target is not None:
-            target = target.view(-1)
-            if hidden.size(0) != target.size(0):
-                raise RuntimeError('Input and target should have the same size '
+        if labels is not None:
+            labels = labels.view(-1)
+            if hidden.size(0) != labels.size(0):
+                raise RuntimeError('Input and labels should have the same size '
                                'in the batch dimension.')

        if self.n_clusters == 0:
            logit = self._compute_logit(hidden, self.out_layers[0].weight,
                                        self.out_layers[0].bias, self.out_projs[0])
-            if target is not None:
+            if labels is not None:
                out = -F.log_softmax(logit, dim=-1) \
-                        .gather(1, target.unsqueeze(1)).squeeze(1)
+                        .gather(1, labels.unsqueeze(1)).squeeze(1)
            else:
                out = F.log_softmax(logit, dim=-1)
        else:
@@ -144,31 +144,31 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
            head_logprob = F.log_softmax(head_logit, dim=1)

-            if target is None:
+            if labels is None:
                out = hidden.new_empty((head_logit.size(0), self.n_token))
            else:
-                out = torch.zeros_like(target, dtype=hidden.dtype, device=hidden.device)
+                out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device)

            offset = 0
            cutoff_values = [0] + self.cutoffs
            for i in range(len(cutoff_values) - 1):
                l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]

-                if target is not None:
-                    mask_i = (target >= l_idx) & (target < r_idx)
+                if labels is not None:
+                    mask_i = (labels >= l_idx) & (labels < r_idx)
                    indices_i = mask_i.nonzero().squeeze()

                    if indices_i.numel() == 0:
                        continue

-                    target_i = target.index_select(0, indices_i) - l_idx
+                    target_i = labels.index_select(0, indices_i) - l_idx
                    head_logprob_i = head_logprob.index_select(0, indices_i)
                    hidden_i = hidden.index_select(0, indices_i)
                else:
                    hidden_i = hidden

                if i == 0:
-                    if target is not None:
+                    if labels is not None:
                        logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)
                    else:
                        out[:, :self.cutoffs[0]] = head_logprob[:, :self.cutoffs[0]]
@@ -178,14 +178,14 @@ class ProjectedAdaptiveLogSoftmax(nn.Module):
                    tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
                    tail_logprob_i = F.log_softmax(tail_logit_i, dim=1)
                    cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
-                    if target is not None:
+                    if labels is not None:
                        logprob_i = head_logprob_i[:, cluster_prob_idx] \
                                + tail_logprob_i.gather(1, target_i[:, None]).squeeze(1)
                    else:
                        logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
                        out[:, l_idx:r_idx] = logprob_i

-                if target is not None:
+                if labels is not None:
                    if (hasattr(self, 'keep_order') and self.keep_order) or keep_order:
                        out.index_copy_(0, indices_i, -logprob_i)
                    else:
@@ -272,7 +272,6 @@ class LogUniformSampler(object):
            self.range_max = range_max
            log_indices = torch.arange(1., range_max+2., 1.).log_()
            self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
-            # print('P', self.dist.numpy().tolist()[-30:])

            self.log_q = (- (-self.dist.double().log1p_() * 2 * n_sample).expm1_()).log_().float()

@@ -331,72 +330,3 @@ def sample_logits(embedding, bias, labels, inputs, sampler):
    logits = torch.cat([true_logits[:, :, None], sample_logits], -1)

    return logits
-
-
-# class LogUniformSampler(object):
-#     def __init__(self, range_max, unique=False):
-#         """
-#         Reference : https://github.com/tensorflow/tensorflow/blob/r1.10/tensorflow/python/ops/candidate_sampling_ops.py
-#             `P(class) = (log(class + 2) - log(class + 1)) / log(range_max + 1)`
-#         """
-#         self.range_max = range_max
-#         log_indices = torch.arange(1., range_max+2., 1.).log_()
-#         self.dist = (log_indices[1:] - log_indices[:-1]) / log_indices[-1]
-
-#         self.unique = unique
-
-#         if self.unique:
-#             self.exclude_mask = torch.ByteTensor(range_max).fill_(0)
-
-#     def sample(self, n_sample, labels):
-#         pos_sample, new_labels = labels.unique(return_inverse=True)
-#         n_pos_sample = pos_sample.size(0)
-#         n_neg_sample = n_sample - n_pos_sample
-
-#         if self.unique:
-#             self.exclude_mask.index_fill_(0, pos_sample, 1)
-#             sample_dist = self.dist.clone().masked_fill_(self.exclude_mask, 0)
-#             self.exclude_mask.index_fill_(0, pos_sample, 0)
-#         else:
-#             sample_dist = self.dist
-
-#         neg_sample = torch.multinomial(sample_dist, n_neg_sample)
-
-#         sample = torch.cat([pos_sample, neg_sample])
-#         sample_prob = self.dist[sample]
-
-#         return new_labels, sample, sample_prob
-
-
-if __name__ == '__main__':
-    S, B = 3, 4
-    n_vocab = 10000
-    n_sample = 5
-    H = 32
-
-    labels = torch.LongTensor(S, B).random_(0, n_vocab)
-
-    # sampler = LogUniformSampler(n_vocab, unique=False)
-    # new_labels, sample, sample_prob = sampler.sample(n_sample, labels)
-
-    sampler = LogUniformSampler(n_vocab, n_sample)#, unique=True)
-    # true_probs, samp_probs, neg_samples = sampler.sample(n_sample, labels)
-
-    # print('true_probs', true_probs.numpy().tolist())
-    # print('samp_probs', samp_probs.numpy().tolist())
-    # print('neg_samples', neg_samples.numpy().tolist())
-
-    # print('sum', torch.sum(sampler.dist).item())
-
-    # assert torch.all(torch.sort(sample.unique())[0].eq(torch.sort(sample)[0])).item()
-
-    embedding = nn.Embedding(n_vocab, H)
-    bias = torch.zeros(n_vocab)
-    inputs = torch.Tensor(S, B, H).normal_()
-
-    logits, out_labels = sample_logits(embedding, bias, labels, inputs, sampler, n_sample)
-    print('logits', logits.detach().numpy().tolist())
-    print('logits shape', logits.size())
-    print('out_labels', out_labels.detach().numpy().tolist())
-    print('out_labels shape', out_labels.size())
-
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -0,0 +1,839 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch model utilities: base configuration and base model classes."""
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import copy
+import json
+import logging
+import os
+from io import open
+
+import six
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+from torch.nn import functional as F
+
+from .file_utils import cached_path
+
+logger = logging.getLogger(__name__)
+
+CONFIG_NAME = "config.json"
+WEIGHTS_NAME = "pytorch_model.bin"
+TF_WEIGHTS_NAME = 'model.ckpt'
+
+
+if not six.PY2:
+    def add_start_docstrings(*docstr):  # decorator factory: prepends the joined `docstr` strings to the target's docstring
+        def docstring_decorator(fn):
+            fn.__doc__ = ''.join(docstr) + (fn.__doc__ or '')  # __doc__ is None with no docstring or under `python -OO`
+            return fn
+        return docstring_decorator
+else:
+    # Not possible to update class docstrings on python2: the decorator returns `fn` untouched
+    def add_start_docstrings(*docstr):
+        def docstring_decorator(fn):
+            return fn
+        return docstring_decorator
+
+
+class PretrainedConfig(object):
+    """ Base class for all configuration classes.
+        Handle a few common parameters and methods for loading/downloading/saving configurations.
+    """
+    pretrained_config_archive_map = {}
+
+    def __init__(self, **kwargs):
+        self.finetuning_task = kwargs.pop('finetuning_task', None)
+        self.num_labels = kwargs.pop('num_labels', 2)
+        self.output_attentions = kwargs.pop('output_attentions', False)
+        self.output_hidden_states = kwargs.pop('output_hidden_states', False)
+        self.torchscript = kwargs.pop('torchscript', False)
+
+    def save_pretrained(self, save_directory):
+        """ Save a configuration object to a directory, so that it
+            can be re-loaded using the `from_pretrained(save_directory)` class method.
+        """
+        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_config_file = os.path.join(save_directory, CONFIG_NAME)
+
+        self.to_json_file(output_config_file)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *input, **kwargs):
+        r""" Instantiate a PretrainedConfig from a pre-trained model configuration.
+
+        Params:
+            **pretrained_model_name_or_path**: either:
+                - a string with the `shortcut name` of a pre-trained model configuration to load from cache
+                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
+                - a path to a `directory` containing a configuration file saved
+                    using the `save_pretrained(save_directory)` method.
+                - a path or url to a saved configuration `file`.
+            **cache_dir**: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+            **kwargs**: (`optional`) dict:
+                Dictionary of key, values to update the configuration object after loading.
+                Can be used to override selected configuration parameters.
+
+        Examples::
+
+            >>> config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            >>> config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+            >>> config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
+            >>> config = BertConfig.from_pretrained('bert-base-uncased', output_attentions=True)
+            >>> assert config.output_attentions == True
+
+        """
+        cache_dir = kwargs.pop('cache_dir', None)
+
+        if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
+            config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
+        elif os.path.isdir(pretrained_model_name_or_path):
+            config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
+        else:
+            config_file = pretrained_model_name_or_path
+        # redirect to the cache, if necessary
+        try:
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
+                logger.error(
+                    "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
+                        config_file))
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find any file "
+                    "associated to this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(cls.pretrained_config_archive_map.keys()),
+                        config_file))
+            return None  # NOTE: the failure is only logged, not raised; callers receive None
+        if resolved_config_file == config_file:
+            logger.info("loading configuration file {}".format(config_file))
+        else:
+            logger.info("loading configuration file {} from cache at {}".format(
+                config_file, resolved_config_file))
+
+        # Load config
+        config = cls.from_json_file(resolved_config_file)
+
+        # Update config with kwargs if needed
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(config, key):  # only attributes the config already has are overridden
+                setattr(config, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info("Model config %s", config)
+        return config
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `Config` from a Python dictionary of parameters."""
+        config = cls(vocab_size_or_config_json_file=-1)  # placeholder instance; attributes are then set from json_object
+        for key, value in json_object.items():
+            config.__dict__[key] = value
+        return config
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `Config` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        return cls.from_dict(json.loads(text))
+
+    def __eq__(self, other):
+        return self.__dict__ == other.__dict__
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path):
+        """ Save this instance to a json file."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
+
+
+class PreTrainedModel(nn.Module):
+    """ Base class for all models. Handle loading/storing model config and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = PretrainedConfig
+    pretrained_model_archive_map = {}
+    load_tf_weights = lambda model, config, path: None
+    base_model_prefix = ""
+    input_embeddings = None
+
+    def __init__(self, config, *inputs, **kwargs):
+        super(PreTrainedModel, self).__init__()
+        if not isinstance(config, PretrainedConfig):  # fail early on anything that is not a PretrainedConfig instance
+            raise ValueError(
+                "Parameter config in `{}(config)` should be an instance of class `PretrainedConfig`. "
+                "To create a model from a pretrained model use "
+                "`model = {}.from_pretrained(PRETRAINED_MODEL_NAME)`".format(
+                    self.__class__.__name__, self.__class__.__name__
+                ))
+        # Keep the configuration on the model (read later, e.g. by `_tie_or_clone_weights` and `save_pretrained`)
+        self.config = config
+
+    def _get_resized_embeddings(self, old_embeddings, new_num_tokens=None):
+        """ Build a resized Embedding Module from a provided token Embedding Module.
+            Increasing the size will add newly initialized vectors at the end
+            Reducing the size will remove vectors from the end
+
+        Args:
+            new_num_tokens: (`optional`) int
+                New number of tokens in the embedding matrix.
+                Increasing the size will add newly initialized vectors at the end
+                Reducing the size will remove vectors from the end
+                If not provided or None: return the provided token Embedding Module.
+        Return: ``torch.nn.Embedding``
+            Pointer to the resized Embedding Module or the old Embedding Module if new_num_tokens is None
+        """
+        if new_num_tokens is None:
+            return old_embeddings
+
+        old_num_tokens, old_embedding_dim = old_embeddings.weight.size()
+        if old_num_tokens == new_num_tokens:  # already the requested size: reuse the existing module
+            return old_embeddings
+
+        # Build new embeddings
+        new_embeddings = nn.Embedding(new_num_tokens, old_embedding_dim)
+        new_embeddings.to(old_embeddings.weight.device)  # nn.Module.to() modifies the module in place
+
+        # initialize all new embeddings (in particular added tokens)
+        self.init_weights(new_embeddings)
+
+        # Copy word embeddings from the previous weights
+        num_tokens_to_copy = min(old_num_tokens, new_num_tokens)
+        new_embeddings.weight.data[:num_tokens_to_copy, :] = old_embeddings.weight.data[:num_tokens_to_copy, :]
+
+        return new_embeddings
+
+    def _tie_or_clone_weights(self, first_module, second_module):
+        """ Tie or clone module weights depending on whether we are using TorchScript or not
+        """
+        if self.config.torchscript:
+            first_module.weight = nn.Parameter(second_module.weight.clone())  # independent copy of the weights
+        else:
+            first_module.weight = second_module.weight  # both modules now share the same Parameter object
+
+    def resize_token_embeddings(self, new_num_tokens=None):
+        """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
+            Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
+
+        Args:
+            new_num_tokens: (`optional`) int
+                New number of tokens in the embedding matrix.
+                Increasing the size will add newly initialized vectors at the end
+                Reducing the size will remove vectors from the end
+                If not provided or None: does nothing and just returns a pointer to the input tokens Embedding Module of the model.
+
+        Return: ``torch.nn.Embedding``
+            Pointer to the input tokens Embedding Module of the model
+        """
+        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+        model_embeds = base_model._resize_token_embeddings(new_num_tokens)  # the actual resize is done by the base model
+        if new_num_tokens is None:
+            return model_embeds
+
+        # Update base model and current model config
+        self.config.vocab_size = new_num_tokens
+        base_model.vocab_size = new_num_tokens
+
+        # Tie weights again if needed
+        if hasattr(self, 'tie_weights'):
+            self.tie_weights()
+
+        return model_embeds
+
+    def prune_heads(self, heads_to_prune):
+        """ Prunes heads of the base model (delegates to the base model's ``_prune_heads``).
+            Args:
+                heads_to_prune: dict of {layer_num (int): list of heads to prune in this layer (list of int)}
+        """
+        base_model = getattr(self, self.base_model_prefix, self)  # attribute named by base_model_prefix, else self
+        base_model._prune_heads(heads_to_prune)
+
+    def save_pretrained(self, save_directory):
+        """ Save a model with its configuration file to a directory, so that it
+            can be re-loaded using the `from_pretrained(save_directory)` class method.
+        """
+        assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
+
+        # Only save the model itself if we are using distributed training (unwrap an object exposing `.module`)
+        model_to_save = self.module if hasattr(self, 'module') else self
+
+        # Save configuration file
+        model_to_save.config.save_pretrained(save_directory)
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_file = os.path.join(save_directory, WEIGHTS_NAME)
+
+        torch.save(model_to_save.state_dict(), output_model_file)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
+        r""" Instantiate a pretrained model from a pre-trained model configuration and its weights.
+
+        Params:
+            **pretrained_model_name_or_path**: either:
+                - a string with the `shortcut name` of a pre-trained model to load from cache
+                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
+                - a path to a `directory` containing a configuration file and weights saved
+                    using the `save_pretrained(save_directory)` method.
+                - a path or url to a tensorflow checkpoint (``.index`` is appended to it by this method).
+                    In this case, ``from_tf`` should be set to True and a configuration object should be
+                    provided as `config` argument. This loading option is slower than converting the TensorFlow
+                    checkpoint in a PyTorch model using the provided conversion scripts and loading
+                    the PyTorch model afterwards.
+            **config**: an optional configuration for the model to use instead of an automatically loaded configuration.
+                Configuration can be automatically loaded when:
+                - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
+                - the model was saved using the `save_pretrained(save_directory)` (loaded by supplying the save directory).
+            **state_dict**: an optional state dictionary for the model to use instead of a state dictionary loaded
+                from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
+                a simpler option.
+            **cache_dir**: (`optional`) string:
+                Path to a directory in which the downloaded pre-trained weights (and configuration)
+                should be cached if the standard cache should not be used.
+            **from_tf**: (`optional`) boolean, default ``False``:
+                Load the weights from a TensorFlow checkpoint through ``cls.load_tf_weights``.
+                In that case the value returned by ``cls.load_tf_weights`` is returned directly and
+                ``output_loading_info`` is not taken into account.
+            **output_loading_info**: (`optional`) boolean:
+                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
+            **kwargs**: (`optional`) dict:
+                Dictionary of key, values to update the configuration object after loading.
+                Can be used to override selected configuration parameters. E.g. ``output_attention=True``
+
+        Returns:
+            The loaded model, or a tuple ``(model, loading_info)`` if ``output_loading_info`` is ``True``.
+            ``None`` is returned (after logging an error) when the weights file cannot be resolved.
+
+        Raises:
+            RuntimeError: if loading the state dictionary in the model produced error messages.
+
+        Examples::
+
+            >>> model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            >>> model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            >>> model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            >>> assert model.config.output_attention == True
+            >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            >>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
+            >>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        config = kwargs.pop('config', None)
+        state_dict = kwargs.pop('state_dict', None)
+        cache_dir = kwargs.pop('cache_dir', None)
+        from_tf = kwargs.pop('from_tf', False)
+        output_loading_info = kwargs.pop('output_loading_info', False)
+
+        # Load config
+        if config is None:
+            # Forward a user supplied cache directory so that the configuration is cached at the
+            # same place as the weights. It is only forwarded when explicitly set, which leaves
+            # the default call unchanged.
+            # NOTE(review): assumes `config_class.from_pretrained` accepts `cache_dir` - confirm.
+            config_kwargs = dict(kwargs)
+            if cache_dir is not None:
+                config_kwargs['cache_dir'] = cache_dir
+            config = cls.config_class.from_pretrained(pretrained_model_name_or_path, *inputs, **config_kwargs)
+
+        # Load model
+        if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
+            archive_file = cls.pretrained_model_archive_map[pretrained_model_name_or_path]
+        elif os.path.isdir(pretrained_model_name_or_path):
+            if from_tf:
+                # Directly load from a TensorFlow checkpoint
+                archive_file = os.path.join(pretrained_model_name_or_path, TF_WEIGHTS_NAME + ".index")
+            else:
+                archive_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)
+        else:
+            if from_tf:
+                # Directly load from a TensorFlow checkpoint
+                archive_file = pretrained_model_name_or_path + ".index"
+            else:
+                archive_file = pretrained_model_name_or_path
+        # redirect to the cache, if necessary
+        try:
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+        except EnvironmentError:
+            if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
+                logger.error(
+                    "Couldn't reach server at '{}' to download pretrained weights.".format(
+                        archive_file))
+            else:
+                logger.error(
+                    "Model name '{}' was not found in model name list ({}). "
+                    "We assumed '{}' was a path or url but couldn't find any file "
+                    "associated to this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(cls.pretrained_model_archive_map.keys()),
+                        archive_file))
+            return None
+        if resolved_archive_file == archive_file:
+            logger.info("loading weights file {}".format(archive_file))
+        else:
+            logger.info("loading weights file {} from cache at {}".format(
+                archive_file, resolved_archive_file))
+
+        # Instantiate model.
+        model = cls(config)
+
+        if state_dict is None and not from_tf:
+            state_dict = torch.load(resolved_archive_file, map_location='cpu')
+        if from_tf:
+            # Directly load from a TensorFlow checkpoint
+            return cls.load_tf_weights(model, config, resolved_archive_file[:-6])  # Remove the '.index'
+
+        # Convert old format to new format if needed from a PyTorch state_dict
+        old_keys = []
+        new_keys = []
+        for key in state_dict.keys():
+            # Apply both renamings on the same running key so that a key holding 'gamma' as well
+            # as 'beta' gets both of them converted.
+            new_key = key
+            if 'gamma' in new_key:
+                new_key = new_key.replace('gamma', 'weight')
+            if 'beta' in new_key:
+                new_key = new_key.replace('beta', 'bias')
+            if new_key != key:
+                old_keys.append(key)
+                new_keys.append(new_key)
+        # The renaming is applied after the scan so the dict is not modified while iterating over it.
+        for old_key, new_key in zip(old_keys, new_keys):
+            state_dict[new_key] = state_dict.pop(old_key)
+
+        # Load from a PyTorch state_dict
+        missing_keys = []
+        unexpected_keys = []
+        error_msgs = []
+        # copy state_dict so _load_from_state_dict can modify it
+        metadata = getattr(state_dict, '_metadata', None)
+        state_dict = state_dict.copy()
+        if metadata is not None:
+            state_dict._metadata = metadata
+
+        def load(module, prefix=''):
+            # Recursively load `module` then its children, extending the key prefix at each level.
+            local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+            module._load_from_state_dict(
+                state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs)
+            for name, child in module._modules.items():
+                if child is not None:
+                    load(child, prefix + name + '.')
+
+        # Make sure we are able to load base models as well as derived models (with heads)
+        start_prefix = ''
+        model_to_load = model
+        if not hasattr(model, cls.base_model_prefix) and any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+            # Base model instantiated, checkpoint saved from a model with a head: prefix the keys.
+            start_prefix = cls.base_model_prefix + '.'
+        if hasattr(model, cls.base_model_prefix) and not any(s.startswith(cls.base_model_prefix) for s in state_dict.keys()):
+            # Model with a head instantiated, checkpoint saved from a base model: load the base model only.
+            model_to_load = getattr(model, cls.base_model_prefix)
+
+        load(model_to_load, prefix=start_prefix)
+        if len(missing_keys) > 0:
+            logger.info("Weights of {} not initialized from pretrained model: {}".format(
+                model.__class__.__name__, missing_keys))
+        if len(unexpected_keys) > 0:
+            logger.info("Weights from pretrained model not used in {}: {}".format(
+                model.__class__.__name__, unexpected_keys))
+        if len(error_msgs) > 0:
+            raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+                               model.__class__.__name__, "\n\t".join(error_msgs)))
+
+        if hasattr(model, 'tie_weights'):
+            model.tie_weights()  # make sure word embedding weights are still tied
+
+        if output_loading_info:
+            loading_info = {"missing_keys": missing_keys, "unexpected_keys": unexpected_keys, "error_msgs": error_msgs}
+            return model, loading_info
+
+        return model
+
+
+class Conv1D(nn.Module):
+    def __init__(self, nf, nx):
+        """ Conv1D layer as defined by Radford et al. for OpenAI GPT (and also used in GPT-2).
+            It behaves like a Linear layer, except that the weight is stored transposed,
+            i.e. with shape (nx, nf) = (input features, output features).
+        """
+        super(Conv1D, self).__init__()
+        self.nf = nf
+        initial_weight = torch.empty(nx, nf)
+        nn.init.normal_(initial_weight, std=0.02)
+        self.weight = nn.Parameter(initial_weight)
+        self.bias = nn.Parameter(torch.zeros(nf))
+
+    def forward(self, x):
+        """ Project the last dimension of `x` from nx to nf features, keeping all leading dimensions. """
+        output_shape = x.size()[:-1] + (self.nf,)
+        # Flatten every leading dimension so that a single matrix product (plus bias) does the job.
+        flat_input = x.view(-1, x.size(-1))
+        projected = torch.addmm(self.bias, flat_input, self.weight)
+        return projected.view(*output_shape)
+
+
+class PoolerStartLogits(nn.Module):
+    """ Compute SQuAD start_logits from sequence hidden states. """
+    def __init__(self, config):
+        super(PoolerStartLogits, self).__init__()
+        # One score per token.
+        self.dense = nn.Linear(config.hidden_size, 1)
+
+    def forward(self, hidden_states, p_mask=None):
+        """ Args:
+            **p_mask**: (`optional`) ``torch.FloatTensor`` of shape `(batch_size, seq_len)`
+                invalid position mask such as query and special symbols (PAD, SEP, CLS)
+                1.0 means token should be masked.
+
+            Returns the start logits, the trailing dimension of size 1 being squeezed out.
+        """
+        logits = self.dense(hidden_states).squeeze(-1)
+        if p_mask is None:
+            return logits
+
+        # Masked positions are replaced by a very large negative value, the others are left untouched.
+        return logits * (1 - p_mask) - 1e30 * p_mask
+
+
+class PoolerEndLogits(nn.Module):
+    """ Compute SQuAD end_logits from sequence hidden states and start token hidden state.
+    """
+    def __init__(self, config):
+        super(PoolerEndLogits, self).__init__()
+        # The input of the first layer is the concatenation [token state ; start token state].
+        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
+        self.activation = nn.Tanh()
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dense_1 = nn.Linear(config.hidden_size, 1)
+
+    def forward(self, hidden_states, start_states=None, start_positions=None, p_mask=None):
+        """ Args:
+            One of ``start_states``, ``start_positions`` should be not None.
+            If both are set, ``start_positions`` overrides ``start_states``.
+
+            **start_states**: ``torch.FloatTensor`` of shape identical to hidden_states
+                hidden states of the first tokens for the labeled span.
+            **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
+                position of the first token for the labeled span.
+            **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
+                Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
+                1.0 means token should be masked.
+        """
+        assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None"
+        if start_positions is not None:
+            seq_len, hidden_size = hidden_states.shape[-2:]
+            # Pick the hidden state found at the start position of each sample ...
+            gather_index = start_positions[:, None, None].expand(-1, -1, hidden_size)  # shape (bsz, 1, hsz)
+            picked_states = hidden_states.gather(-2, gather_index)  # shape (bsz, 1, hsz)
+            # ... and repeat it along the sequence so that it can be paired with every token.
+            start_states = picked_states.expand(-1, seq_len, -1)  # shape (bsz, slen, hsz)
+
+        features = torch.cat([hidden_states, start_states], dim=-1)
+        features = self.LayerNorm(self.activation(self.dense_0(features)))
+        logits = self.dense_1(features).squeeze(-1)
+
+        if p_mask is None:
+            return logits
+
+        # Masked positions are replaced by a very large negative value.
+        return logits * (1 - p_mask) - 1e30 * p_mask
+
+
+class PoolerAnswerClass(nn.Module):
+    """ Compute SQuAD 2.0 answer class from classification and start tokens hidden states. """
+    def __init__(self, config):
+        super(PoolerAnswerClass, self).__init__()
+        # The input of the first layer is the concatenation [start token state ; CLS token state].
+        self.dense_0 = nn.Linear(config.hidden_size * 2, config.hidden_size)
+        self.activation = nn.Tanh()
+        self.dense_1 = nn.Linear(config.hidden_size, 1, bias=False)
+
+    def forward(self, hidden_states, start_states=None, start_positions=None, cls_index=None):
+        """
+        Args:
+            One of ``start_states``, ``start_positions`` should be not None.
+            If both are set, ``start_positions`` overrides ``start_states``.
+
+            **start_states**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
+                hidden states of the first tokens for the labeled span
+                (it is concatenated as is with the CLS token state).
+            **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
+                position of the first token for the labeled span.
+            **cls_index**: torch.LongTensor of shape ``(batch_size,)``
+                position of the CLS token. If None, take the last token.
+
+            note(Original repo):
+                no dependency on end_feature so that we can obtain one single `cls_logits`
+                for each sample
+        """
+        hidden_size = hidden_states.shape[-1]
+        assert start_states is not None or start_positions is not None, "One of start_states, start_positions should be not None"
+
+        if start_positions is not None:
+            start_index = start_positions[:, None, None].expand(-1, -1, hidden_size)  # shape (bsz, 1, hsz)
+            start_states = hidden_states.gather(-2, start_index).squeeze(-2)  # shape (bsz, hsz)
+
+        if cls_index is None:
+            # Without an explicit CLS position, the last token of the sequence is used.
+            cls_token_state = hidden_states[:, -1, :]  # shape (bsz, hsz)
+        else:
+            cls_gather_index = cls_index[:, None, None].expand(-1, -1, hidden_size)  # shape (bsz, 1, hsz)
+            cls_token_state = hidden_states.gather(-2, cls_gather_index).squeeze(-2)  # shape (bsz, hsz)
+
+        features = torch.cat([start_states, cls_token_state], dim=-1)
+        features = self.activation(self.dense_0(features))
+        return self.dense_1(features).squeeze(-1)
+
+
+class SQuADHead(nn.Module):
+    r""" A SQuAD head inspired by XLNet.
+
+    Parameters:
+        config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+
+    Inputs:
+        **hidden_states**: ``torch.FloatTensor`` of shape ``(batch_size, seq_len, hidden_size)``
+            hidden states of sequence tokens
+        **start_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
+            position of the first token for the labeled span.
+        **end_positions**: ``torch.LongTensor`` of shape ``(batch_size,)``
+            position of the last token for the labeled span.
+        **cls_index**: torch.LongTensor of shape ``(batch_size,)``
+            position of the CLS token. If None, take the last token.
+        **is_impossible**: ``torch.LongTensor`` or ``torch.FloatTensor`` of shape ``(batch_size,)``
+            Whether the question has a possible answer in the paragraph or not.
+            It is cast to the dtype of the answerability logits before the loss is computed.
+        **p_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, seq_len)``
+            Mask of invalid position such as query and special symbols (PAD, SEP, CLS)
+            1.0 means token should be masked.
+
+    The label tensors may carry an extra trailing dimension (e.g. added by multi-GPU batch splitting);
+    it is squeezed on local copies, the tensors given by the caller are left untouched.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned if both ``start_positions`` and ``end_positions`` are provided) scalar ``torch.FloatTensor``:
+            Mean of the start token and end token classification losses, plus half of the ``is_impossible``
+            classification loss when both ``cls_index`` and ``is_impossible`` are provided.
+        **start_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top)``
+            Probabilities (softmax over the sequence, not log-probabilities despite the name)
+            of the top config.start_n_top start token possibilities (beam-search).
+        **start_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top)``
+            Indices for the top config.start_n_top start token possibilities (beam-search).
+        **end_top_log_probs**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.FloatTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
+            Probabilities (softmax over the sequence, not log-probabilities despite the name)
+            of the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
+        **end_top_index**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.LongTensor`` of shape ``(batch_size, config.start_n_top * config.end_n_top)``
+            Indices for the top ``config.start_n_top * config.end_n_top`` end token possibilities (beam-search).
+        **cls_logits**: (`optional`, returned if ``start_positions`` or ``end_positions`` is not provided)
+            ``torch.FloatTensor`` of shape ``(batch_size,)``
+            Logits (no sigmoid applied) for the ``is_impossible`` label of the answers.
+    """
+    def __init__(self, config):
+        super(SQuADHead, self).__init__()
+        self.start_n_top = config.start_n_top
+        self.end_n_top = config.end_n_top
+
+        self.start_logits = PoolerStartLogits(config)
+        self.end_logits = PoolerEndLogits(config)
+        self.answer_class = PoolerAnswerClass(config)
+
+    def forward(self, hidden_states, start_positions=None, end_positions=None,
+                cls_index=None, is_impossible=None, p_mask=None):
+        outputs = ()
+
+        start_logits = self.start_logits(hidden_states, p_mask=p_mask)
+
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, let's remove the dimension added by batch splitting.
+            # The squeeze is done out-of-place so that the tensors owned by the caller keep their shape.
+            start_positions, end_positions, cls_index, is_impossible = [
+                x.squeeze(-1) if x is not None and x.dim() > 1 else x
+                for x in (start_positions, end_positions, cls_index, is_impossible)]
+
+            # during training, compute the end logits based on the ground truth of the start position
+            end_logits = self.end_logits(hidden_states, start_positions=start_positions, p_mask=p_mask)
+
+            loss_fct = CrossEntropyLoss()
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+            if cls_index is not None and is_impossible is not None:
+                # Predict answerability from the representation of CLS and START
+                cls_logits = self.answer_class(hidden_states, start_positions=start_positions, cls_index=cls_index)
+                loss_fct_cls = nn.BCEWithLogitsLoss()
+                # BCEWithLogitsLoss needs a floating point target: integer labels are cast to the
+                # dtype of the logits (a no-op when the labels already have that dtype).
+                cls_loss = loss_fct_cls(cls_logits, is_impossible.to(cls_logits.dtype))
+
+                # note(zhiliny): by default multiply the loss by 0.5 so that the scale is comparable to start_loss and end_loss
+                total_loss += cls_loss * 0.5
+
+            outputs = (total_loss,) + outputs
+
+        else:
+            # during inference, compute the end logits based on beam search
+            bsz, slen, hsz = hidden_states.size()
+            # NB: despite their names, the `*_log_probs` variables hold softmax probabilities.
+            start_log_probs = F.softmax(start_logits, dim=-1) # shape (bsz, slen)
+
+            start_top_log_probs, start_top_index = torch.topk(start_log_probs, self.start_n_top, dim=-1) # shape (bsz, start_n_top)
+            start_top_index_exp = start_top_index.unsqueeze(-1).expand(-1, -1, hsz) # shape (bsz, start_n_top, hsz)
+            start_states = torch.gather(hidden_states, -2, start_top_index_exp) # shape (bsz, start_n_top, hsz)
+            start_states = start_states.unsqueeze(1).expand(-1, slen, -1, -1) # shape (bsz, slen, start_n_top, hsz)
+
+            hidden_states_expanded = hidden_states.unsqueeze(2).expand_as(start_states) # shape (bsz, slen, start_n_top, hsz)
+            # Add a trailing dimension to the mask so that it broadcasts over the start candidates.
+            p_mask = p_mask.unsqueeze(-1) if p_mask is not None else None
+            end_logits = self.end_logits(hidden_states_expanded, start_states=start_states, p_mask=p_mask)
+            end_log_probs = F.softmax(end_logits, dim=1) # shape (bsz, slen, start_n_top)
+
+            end_top_log_probs, end_top_index = torch.topk(end_log_probs, self.end_n_top, dim=1) # shape (bsz, end_n_top, start_n_top)
+            end_top_log_probs = end_top_log_probs.view(-1, self.start_n_top * self.end_n_top)
+            end_top_index = end_top_index.view(-1, self.start_n_top * self.end_n_top)
+
+            # Expected start state under the start distribution; it feeds the answerability classifier.
+            start_states = torch.einsum("blh,bl->bh", hidden_states, start_log_probs)
+            cls_logits = self.answer_class(hidden_states, start_states=start_states, cls_index=cls_index)
+
+            outputs = (start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits) + outputs
+
+        # return start_top_log_probs, start_top_index, end_top_log_probs, end_top_index, cls_logits
+        # or (if labels are provided) (total_loss,)
+        return outputs
+
+
+class SequenceSummary(nn.Module):
+    r""" Compute a single vector summary of a sequence hidden states according to various possibilities:
+        Args of the config class (all of them are optional attributes of the configuration):
+            summary_type:
+                - 'last' => [default] take the last token hidden state (like XLNet)
+                - 'first' => take the first token hidden state (like Bert)
+                - 'mean' => take the mean of all tokens hidden states
+                - 'token_ids' => supply a Tensor of classification token indices (GPT/GPT-2)
+                - 'attn' => Not implemented now, use multi-head attention
+            summary_use_proj: Add a projection after the vector extraction
+            summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
+            summary_activation: 'tanh' => add a tanh activation to the output, Other => no activation. Default: no activation.
+            summary_first_dropout: Add a dropout before the projection and activation
+            summary_last_dropout: Add a dropout after the projection and activation
+    """
+    def __init__(self, config):
+        super(SequenceSummary, self).__init__()
+
+        # Fall back to 'last' when the configuration does not define a summary type.
+        self.summary_type = getattr(config, 'summary_type', 'last')
+        if self.summary_type == 'attn':
+            # We should use a standard multi-head attention module with absolute positional embedding for that.
+            # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
+            # We can probably just use the multi-head attention module of PyTorch >=1.1.0
+            raise NotImplementedError
+
+        # Each stage defaults to a pass-through and is only replaced when the configuration asks for it.
+        self.summary = nn.Identity()
+        if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
+            if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
+                num_classes = config.num_labels
+            else:
+                num_classes = config.hidden_size
+            self.summary = nn.Linear(config.hidden_size, num_classes)
+
+        self.activation = nn.Identity()
+        if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
+            self.activation = nn.Tanh()
+
+        self.first_dropout = nn.Identity()
+        if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
+            self.first_dropout = nn.Dropout(config.summary_first_dropout)
+
+        self.last_dropout = nn.Identity()
+        if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
+            self.last_dropout = nn.Dropout(config.summary_last_dropout)
+
+    def forward(self, hidden_states, token_ids=None):
+        """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
+            token_ids: [optional] index of the classification token if summary_type == 'token_ids',
+                shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
+                if summary_type == 'token_ids' and token_ids is None:
+                    we take the last token of the sequence as classification token
+        """
+        if self.summary_type == 'last':
+            output = hidden_states[:, -1]
+        elif self.summary_type == 'first':
+            output = hidden_states[:, 0]
+        elif self.summary_type == 'mean':
+            output = hidden_states.mean(dim=1)
+        elif self.summary_type == 'token_ids':
+            if token_ids is None:
+                # Index of the last token, already expanded to the shape expected by `gather`.
+                token_ids = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
+            else:
+                token_ids = token_ids.unsqueeze(-1).unsqueeze(-1)
+                token_ids = token_ids.expand((-1,) * (token_ids.dim()-1) + (hidden_states.size(-1),))
+            # shape of token_ids: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
+            output = hidden_states.gather(-2, token_ids).squeeze(-2) # shape (bsz, XX, hidden_size)
+        elif self.summary_type == 'attn':
+            raise NotImplementedError
+
+        output = self.first_dropout(output)
+        output = self.summary(output)
+        output = self.activation(output)
+        output = self.last_dropout(output)
+
+        return output
+
+
+def prune_linear_layer(layer, index, dim=0):
+    """ Build a pruned copy of a linear layer, keeping only the entries listed in `index` along `dim`.
+        The returned layer is a new nn.Linear whose parameters have requires_grad=True.
+        Used to remove heads.
+    """
+    device = layer.weight.device
+    has_bias = layer.bias is not None
+    index = index.to(device)
+
+    kept_weight = layer.weight.index_select(dim, index).clone().detach()
+    if has_bias:
+        # The bias has one entry per output feature: it is only sliced when outputs (dim 0) are
+        # pruned, and copied unchanged when inputs (dim 1) are pruned.
+        if dim == 1:
+            kept_bias = layer.bias.clone().detach()
+        else:
+            kept_bias = layer.bias[index].clone().detach()
+
+    pruned_size = list(layer.weight.size())
+    pruned_size[dim] = len(index)
+    out_features, in_features = pruned_size
+    new_layer = nn.Linear(in_features, out_features, bias=has_bias).to(device)
+
+    def _overwrite(parameter, values):
+        # Gradient tracking is switched off for the duration of the in-place copy, then restored.
+        parameter.requires_grad = False
+        parameter.copy_(values.contiguous())
+        parameter.requires_grad = True
+
+    _overwrite(new_layer.weight, kept_weight)
+    if has_bias:
+        _overwrite(new_layer.bias, kept_bias)
+    return new_layer
+
+
+def prune_conv1d_layer(layer, index, dim=1):
+    """ Build a pruned copy of a Conv1D layer, keeping only the entries listed in `index` along `dim`.
+        A Conv1D works like a Linear layer but its weight is stored transposed, as (input, output) features.
+        The returned layer is a new Conv1D whose parameters have requires_grad=True.
+        Used to remove heads.
+    """
+    device = layer.weight.device
+    index = index.to(device)
+
+    kept_weight = layer.weight.index_select(dim, index).clone().detach()
+    # The bias has one entry per output feature (dim 1 of the weight): it is copied unchanged
+    # when inputs (dim 0) are pruned and sliced otherwise.
+    if dim == 0:
+        kept_bias = layer.bias.clone().detach()
+    else:
+        kept_bias = layer.bias[index].clone().detach()
+
+    pruned_size = list(layer.weight.size())
+    pruned_size[dim] = len(index)
+    nx, nf = pruned_size
+    new_layer = Conv1D(nf, nx).to(device)
+
+    # Gradient tracking is switched off for the duration of each in-place copy, then restored.
+    for parameter, values in ((new_layer.weight, kept_weight), (new_layer.bias, kept_bias)):
+        parameter.requires_grad = False
+        parameter.copy_(values.contiguous())
+        parameter.requires_grad = True
+    return new_layer
+
+
+def prune_layer(layer, index, dim=None):
+    """ Prune a Conv1D or nn.Linear layer (a model parameters) to keep only entries in index.
+        When `dim` is None, the default pruning dimension of the layer type is used
+        (0 for nn.Linear, 1 for Conv1D).
+        Return the pruned layer as a new layer with requires_grad=True.
+        Used to remove heads.
+    """
+    # Select the pruning routine and its default dimension from the layer type.
+    if isinstance(layer, nn.Linear):
+        pruner, default_dim = prune_linear_layer, 0
+    elif isinstance(layer, Conv1D):
+        pruner, default_dim = prune_conv1d_layer, 1
+    else:
+        raise ValueError("Can't prune layer of class {}".format(layer.__class__))
+
+    if dim is None:
+        dim = default_dim
+    return pruner(layer, index, dim=dim)
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -0,0 +1,921 @@
+# coding=utf-8
+# Copyright 2019-present, Facebook, Inc and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch XLM model.
+"""
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import math
+import sys
+from io import open
+
+import itertools
+import numpy as np
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from .modeling_utils import (PretrainedConfig, PreTrainedModel, add_start_docstrings,
+                             prune_linear_layer, SequenceSummary, SQuADHead)
+
+logger = logging.getLogger(__name__)
+
+XLM_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-pytorch_model.bin",
+    'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-pytorch_model.bin",
+    'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-pytorch_model.bin",
+    'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-pytorch_model.bin",
+    'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-pytorch_model.bin",
+    'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-pytorch_model.bin",
+    'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-pytorch_model.bin",
+    'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-pytorch_model.bin",
+}
+XLM_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'xlm-mlm-en-2048': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-en-2048-config.json",
+    'xlm-mlm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-ende-1024-config.json",
+    'xlm-mlm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enfr-1024-configl.json",
+    'xlm-mlm-enro-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-enro-1024-config.json",
+    'xlm-mlm-tlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-tlm-xnli15-1024-config.json",
+    'xlm-mlm-xnli15-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-mlm-xnli15-1024-config.json",
+    'xlm-clm-enfr-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-enfr-1024-config.json",
+    'xlm-clm-ende-1024': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-clm-ende-1024-config.json",
+}
+
+
+class XLMConfig(PretrainedConfig):
+    """Configuration class to store the configuration of a `XLMModel`.
+
+    Args:
+        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
+        d_model: Size of the encoder layers and the pooler layer.
+        n_layer: Number of hidden layers in the Transformer encoder.
+        n_head: Number of attention heads for each attention layer in
+            the Transformer encoder.
+        d_inner: The size of the "intermediate" (i.e., feed-forward)
+            layer in the Transformer encoder.
+        ff_activation: The non-linear activation function (function or string) in the
+            encoder and pooler. If string, "gelu", "relu" and "swish" are supported.
+        untie_r: untie relative position biases
+        attn_type: 'bi' for XLM, 'uni' for Transformer-XL
+
+        dropout: The dropout probabilitiy for all fully connected
+            layers in the embeddings, encoder, and pooler.
+        dropatt: The dropout ratio for the attention
+            probabilities.
+        max_position_embeddings: The maximum sequence length that this model might
+            ever be used with. Typically set this to something large just in case
+            (e.g., 512 or 1024 or 2048).
+        initializer_range: The sttdev of the truncated_normal_initializer for
+            initializing all weight matrices.
+        layer_norm_eps: The epsilon used by LayerNorm.
+
+        dropout: float, dropout rate.
+        dropatt: float, dropout rate on attention probabilities.
+        init: str, the initialization scheme, either "normal" or "uniform".
+        init_range: float, initialize the parameters with a uniform distribution
+            in [-init_range, init_range]. Only effective when init="uniform".
+        init_std: float, initialize the parameters with a normal distribution
+            with mean 0 and stddev init_std. Only effective when init="normal".
+        mem_len: int, the number of tokens to cache.
+        reuse_len: int, the number of tokens in the currect batch to be cached
+            and reused in the future.
+        bi_data: bool, whether to use bidirectional input pipeline.
+            Usually set to True during pretraining and False during finetuning.
+        clamp_len: int, clamp all relative distances larger than clamp_len.
+            -1 means no clamping.
+        same_length: bool, whether to use the same attention length for each token.
+    """
+    pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=30145,
+                 emb_dim=2048,
+                 n_layers=12,
+                 n_heads=16,
+                 dropout=0.1,
+                 attention_dropout=0.1,
+                 gelu_activation=True,
+                 sinusoidal_embeddings=False,
+                 causal=False,
+                 asm=False,
+                 n_langs=1,
+                 max_position_embeddings=512,
+                 embed_init_std=2048 ** -0.5,
+                 layer_norm_eps=1e-12,
+                 init_std=0.02,
+                 bos_index=0,
+                 eos_index=1,
+                 pad_index=2,
+                 unk_index=3,
+                 mask_index=5,
+                 is_encoder=True,
+
+                 finetuning_task=None,
+                 num_labels=2,
+                 summary_type='first',
+                 summary_use_proj=True,
+                 summary_activation=None,
+                 summary_proj_to_labels=True,
+                 summary_first_dropout=0.1,
+                 start_n_top=5,
+                 end_n_top=5,
+                 **kwargs):
+        """Constructs XLMConfig.
+        """
+        super(XLMConfig, self).__init__(**kwargs)
+
+        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
+                        and isinstance(vocab_size_or_config_json_file, unicode)):
+            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif isinstance(vocab_size_or_config_json_file, int):
+            self.n_words = vocab_size_or_config_json_file
+            self.emb_dim = emb_dim
+            self.n_layers = n_layers
+            self.n_heads = n_heads
+            self.dropout = dropout
+            self.attention_dropout = attention_dropout
+            self.gelu_activation = gelu_activation
+            self.sinusoidal_embeddings = sinusoidal_embeddings
+            self.causal = causal
+            self.asm = asm
+            self.n_langs = n_langs
+            self.layer_norm_eps = layer_norm_eps
+            self.bos_index = bos_index
+            self.eos_index = eos_index
+            self.pad_index = pad_index
+            self.unk_index = unk_index
+            self.mask_index = mask_index
+            self.is_encoder = is_encoder
+            self.max_position_embeddings = max_position_embeddings
+            self.embed_init_std = embed_init_std
+            self.init_std = init_std
+            self.finetuning_task = finetuning_task
+            self.num_labels = num_labels
+            self.summary_type = summary_type
+            self.summary_use_proj = summary_use_proj
+            self.summary_activation = summary_activation
+            self.summary_proj_to_labels = summary_proj_to_labels
+            self.summary_first_dropout = summary_first_dropout
+            self.start_n_top = start_n_top
+            self.end_n_top = end_n_top
+        else:
+            raise ValueError("First argument must be either a vocabulary size (int)"
+                             "or the path to a pretrained model config file (str)")
+
+    @property
+    def vocab_size(self):
+        return self.n_words
+
+    @vocab_size.setter
+    def vocab_size(self, value):
+        self.n_words = value
+
+    @property
+    def hidden_size(self):
+        return self.emb_dim
+
+    @property
+    def num_attention_heads(self):
+        return self.n_heads
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layers
+
+
+def create_sinusoidal_embeddings(n_pos, dim, out):
+    position_enc = np.array([
+        [pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)]
+        for pos in range(n_pos)
+    ])
+    out[:, 0::2] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
+    out[:, 1::2] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
+    out.detach_()
+    out.requires_grad = False
+
+
+def gelu(x):
+    """
+    GELU activation
+    https://arxiv.org/abs/1606.08415
+    https://github.com/huggingface/pytorch-openai-transformer-lm/blob/master/model_pytorch.py#L14
+    https://github.com/huggingface/pytorch-transformers/blob/master/modeling.py
+    """
+    # return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
+    return 0.5 * x * (1.0 + torch.erf(x / math.sqrt(2.0)))
+
+
+def get_masks(slen, lengths, causal, padding_mask=None):
+    """
+    Generate hidden states mask, and optionally an attention mask.
+    """
+    bs = lengths.size(0)
+    if padding_mask is not None:
+        mask = padding_mask
+    else:
+        assert lengths.max().item() <= slen
+        alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
+        mask = alen < lengths[:, None]
+
+    # attention mask is the same as mask, or triangular inferior attention (causal)
+    if causal:
+        attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
+    else:
+        attn_mask = mask
+
+    # sanity check
+    assert mask.size() == (bs, slen)
+    assert causal is False or attn_mask.size() == (bs, slen, slen)
+
+    return mask, attn_mask
+
+
+class MultiHeadAttention(nn.Module):
+    """ Multi-head scaled dot-product attention with optional key/value caching.
+
+        Each instance draws a process-wide unique `layer_id` from the class-level
+        counter `NEW_ID`; that id is the key under which the layer stores its
+        keys/values in the `cache` dictionary passed to `forward`.
+    """
+
+    # Shared by all instances: hands out consecutive layer ids.
+    NEW_ID = itertools.count()
+
+    def __init__(self, n_heads, dim, config):
+        """ n_heads: number of heads; dim: model dimension (must be divisible by n_heads);
+            config: object providing `output_attentions` and `attention_dropout`.
+        """
+        super(MultiHeadAttention, self).__init__()
+        self.layer_id = next(MultiHeadAttention.NEW_ID)
+        self.output_attentions = config.output_attentions
+        self.dim = dim
+        self.n_heads = n_heads
+        self.dropout = config.attention_dropout
+        assert self.dim % self.n_heads == 0
+
+        self.q_lin = nn.Linear(dim, dim)
+        self.k_lin = nn.Linear(dim, dim)
+        self.v_lin = nn.Linear(dim, dim)
+        self.out_lin = nn.Linear(dim, dim)
+
+    def prune_heads(self, heads):
+        """ Remove the attention heads listed in `heads` (iterable of head indices).
+            The four projection layers are replaced by pruned copies and
+            `n_heads` / `dim` are updated. Does nothing when `heads` is empty.
+        """
+        attention_head_size = self.dim // self.n_heads
+        if len(heads) == 0:
+            return
+        # One row per head; rows of pruned heads are zeroed, then the flattened
+        # mask selects the feature indices that are kept.
+        mask = torch.ones(self.n_heads, attention_head_size)
+        for head in heads:
+            mask[head] = 0
+        mask = mask.view(-1).contiguous().eq(1)
+        index = torch.arange(len(mask))[mask].long()
+        # Prune linear layers
+        self.q_lin = prune_linear_layer(self.q_lin, index)
+        self.k_lin = prune_linear_layer(self.k_lin, index)
+        self.v_lin = prune_linear_layer(self.v_lin, index)
+        self.out_lin = prune_linear_layer(self.out_lin, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)
+        self.dim = attention_head_size * self.n_heads
+
+    def forward(self, input, mask, kv=None, cache=None, head_mask=None):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
+
+        Args:
+            input: query tensor of shape (bs, qlen, dim).
+            mask: (bs, klen) padding mask or 3D (causal) mask; positions where
+                the mask equals 0 are excluded from attention.
+            kv: optional tensor to compute keys/values from instead of `input`.
+            cache: optional dict, modified in place. `cache['slen']` is read as the
+                number of already cached positions and `cache[self.layer_id]`
+                holds this layer's (k, v) pair.
+            head_mask: optional tensor multiplied into the attention weights.
+
+        Returns:
+            Tuple `(output,)`, or `(output, weights)` when `output_attentions` is set.
+        """
+        # Input is (bs, qlen, dim)
+        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
+        bs, qlen, dim = input.size()
+        if kv is None:
+            klen = qlen if cache is None else cache['slen'] + qlen
+        else:
+            klen = kv.size(1)
+        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
+        n_heads = self.n_heads
+        dim_per_head = self.dim // n_heads
+        # Shape the mask is viewed as before being broadcast over heads (and queries).
+        mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)
+
+        def shape(x):
+            """  projection: (bs, len, dim) -> (bs, n_heads, len, dim_per_head) """
+            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
+
+        def unshape(x):
+            """  compute context: (bs, n_heads, len, dim_per_head) -> (bs, len, dim) """
+            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
+
+        q = shape(self.q_lin(input))                                          # (bs, n_heads, qlen, dim_per_head)
+        if kv is None:
+            k = shape(self.k_lin(input))                                      # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v_lin(input))                                      # (bs, n_heads, qlen, dim_per_head)
+        elif cache is None or self.layer_id not in cache:
+            # Attention over `kv`: project it unless this layer's k/v are already cached.
+            k = v = kv
+            k = shape(self.k_lin(k))                                          # (bs, n_heads, klen, dim_per_head)
+            v = shape(self.v_lin(v))                                          # (bs, n_heads, klen, dim_per_head)
+
+        if cache is not None:
+            if self.layer_id in cache:
+                if kv is None:
+                    # Self-attention: prepend the cached keys/values to the new ones.
+                    k_, v_ = cache[self.layer_id]
+                    k = torch.cat([k_, k], dim=2)                             # (bs, n_heads, klen, dim_per_head)
+                    v = torch.cat([v_, v], dim=2)                             # (bs, n_heads, klen, dim_per_head)
+                else:
+                    # Attention over `kv`: reuse the cached projections as they are.
+                    k, v = cache[self.layer_id]
+            cache[self.layer_id] = (k, v)
+
+        q = q / math.sqrt(dim_per_head)                                       # (bs, n_heads, qlen, dim_per_head)
+        scores = torch.matmul(q, k.transpose(2, 3))                           # (bs, n_heads, qlen, klen)
+        mask = (mask == 0).view(mask_reshape).expand_as(scores)               # (bs, n_heads, qlen, klen)
+        scores.masked_fill_(mask, -float('inf'))                              # (bs, n_heads, qlen, klen)
+
+        # Softmax is computed in float32 and cast back to the dtype of `scores`.
+        weights = F.softmax(scores.float(), dim=-1).type_as(scores)           # (bs, n_heads, qlen, klen)
+        weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = torch.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)                                            # (bs, qlen, dim)
+
+        outputs = (self.out_lin(context),)
+        if self.output_attentions:
+            outputs = outputs + (weights,)
+        return outputs
+
+
+class TransformerFFN(nn.Module):
+
+    def __init__(self, in_dim, dim_hidden, out_dim, config):
+        super(TransformerFFN, self).__init__()
+        self.dropout = config.dropout
+        self.lin1 = nn.Linear(in_dim, dim_hidden)
+        self.lin2 = nn.Linear(dim_hidden, out_dim)
+        self.act = gelu if config.gelu_activation else F.relu
+
+    def forward(self, input):
+        x = self.lin1(input)
+        x = self.act(x)
+        x = self.lin2(x)
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        return x
+
+
+class XLMPreTrainedModel(PreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for dowloading and loading pretrained models.
+    """
+    config_class = XLMConfig
+    pretrained_model_archive_map = XLM_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = None
+    base_model_prefix = "transformer"
+
+    def __init__(self, *inputs, **kwargs):
+        super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
+
+    def init_weights(self, module):
+        """ Initialize the weights. """
+        if isinstance(module, nn.Embedding):
+            if self.config is not None and self.config.embed_init_std is not None:
+                nn.init.normal_(module.weight, mean=0, std=self.config.embed_init_std)
+        if isinstance(module, nn.Linear):
+            if self.config is not None and self.config.init_std is not None:
+                nn.init.normal_(module.weight, mean=0, std=self.config.init_std)
+                if hasattr(module, 'bias') and module.bias is not None:
+                    nn.init.constant_(module.bias, 0.)
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+# Shared reStructuredText introduction that `add_start_docstrings` receives for
+# each XLM model class defined below.
+XLM_START_DOCSTRING = r"""    The XLM model was proposed in
+    `Cross-lingual Language Model Pretraining`_
+    by Guillaume Lample*, Alexis Conneau*. It's a transformer pre-trained using one of the following objectives:
+
+        - a causal language modeling (CLM) objective (next token prediction),
+        - a masked language modeling (MLM) objective (Bert-like), or
+        - a Translation Language Modeling (TLM) object (extension of Bert's MLM to multiple language inputs)
+
+    Original code can be found `here`_.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`Cross-lingual Language Model Pretraining`:
+        https://arxiv.org/abs/1901.07291
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    .. _`here`:
+        https://github.com/facebookresearch/XLM
+
+    Parameters:
+        config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model.
+"""
+
+# Shared description of the `forward` inputs, passed to `add_start_docstrings`
+# together with XLM_START_DOCSTRING.
+XLM_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens (can be used to indicate various portions of the inputs).
+            The embeddings from these tokens will be summed with the respective token embeddings.
+            Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
+        **langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            A parallel sequence of tokens to be used to indicate the language of each token in the input.
+            Indices are selected in the pre-trained language vocabulary,
+            i.e. in the range ``[0, config.n_langs - 1[``.
+        **attention_mask**: (`optional`) ``torch.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **lengths**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Length of each sentence that can be used to avoid performing attention on padding token indices.
+            You can also use `attention_mask` for the same result (see above), kept here for compatbility.
+            Indices selected in ``[0, ..., input_ids.size(-1)]``:
+        **cache**:
+            dictionary with ``torch.FloatTensor`` that contains pre-computed
+            hidden-states (key and values in the attention blocks) as computed by the model
+            (see `cache` output below). Can be used to speed up sequential decoding.
+            The dictionary object will be modified in-place during the forward pass to add newly computed hidden-states.
+        **head_mask**: (`optional`) ``torch.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare XLM Model transformer outputing raw hidden-states without any specific head on top.",
+                      XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+class XLMModel(XLMPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs,
+    in this order:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the last layer of the model.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            tuple of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            tuple of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> model = XLMModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    # NOTE(review): this list is not referenced anywhere in the visible part of the
+    # class and names attributes the class never sets (e.g. 'encoder', 'asm'); it
+    # looks like a leftover from the original XLM code -- confirm before relying on it.
+    ATTRIBUTES = ['encoder', 'eos_index', 'pad_index',  # 'with_output', 
+                  'n_langs', 'n_words', 'dim', 'n_layers', 'n_heads', 
+                  'hidden_dim', 'dropout', 'attention_dropout', 'asm',
+                  'asm_cutoffs', 'asm_div_value']
+
+    def __init__(self, config):  #, dico, is_encoder, with_output):
+        """ Build the embeddings and the stack of `config.n_layers` Transformer layers.
+
+            Raises:
+                NotImplementedError: if `config.is_encoder` is False.
+        """
+        super(XLMModel, self).__init__(config)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+
+        # encoder / decoder, output layer
+        self.is_encoder = config.is_encoder
+        self.is_decoder = not config.is_encoder
+        if self.is_decoder:
+            raise NotImplementedError("Currently XLM can only be used as an encoder")
+        # self.with_output = with_output
+        self.causal = config.causal
+
+        # dictionary / languages
+        self.n_langs = config.n_langs
+        self.n_words = config.n_words
+        self.eos_index = config.eos_index
+        self.pad_index = config.pad_index
+        # self.dico = dico
+        # self.id2lang = config.id2lang
+        # self.lang2id = config.lang2id
+        # assert len(self.dico) == self.n_words
+        # assert len(self.id2lang) == len(self.lang2id) == self.n_langs
+
+        # model parameters
+        self.dim = config.emb_dim       # XLMConfig default: 2048
+        self.hidden_dim = self.dim * 4  # feed-forward size is always 4 * emb_dim
+        self.n_heads = config.n_heads   # XLMConfig default: 16
+        self.n_layers = config.n_layers
+        self.dropout = config.dropout
+        self.attention_dropout = config.attention_dropout
+        assert self.dim % self.n_heads == 0, 'transformer dim must be a multiple of n_heads'
+
+        # embeddings
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, self.dim)
+        if config.sinusoidal_embeddings:
+            create_sinusoidal_embeddings(config.max_position_embeddings, self.dim, out=self.position_embeddings.weight)
+        # Language embeddings only exist for multilingual configurations.
+        if config.n_langs > 1:
+            self.lang_embeddings = nn.Embedding(self.n_langs, self.dim)
+        self.embeddings = nn.Embedding(self.n_words, self.dim, padding_idx=self.pad_index)
+        self.layer_norm_emb = nn.LayerNorm(self.dim, eps=config.layer_norm_eps)
+
+        # transformer layers
+        # The layers are stored as four parallel ModuleLists indexed by layer number.
+        self.attentions = nn.ModuleList()
+        self.layer_norm1 = nn.ModuleList()
+        self.ffns = nn.ModuleList()
+        self.layer_norm2 = nn.ModuleList()
+        # if self.is_decoder:
+        #     self.layer_norm15 = nn.ModuleList()
+        #     self.encoder_attn = nn.ModuleList()
+
+        for _ in range(self.n_layers):
+            self.attentions.append(MultiHeadAttention(self.n_heads, self.dim, config=config))
+            self.layer_norm1.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
+            # if self.is_decoder:
+            #     self.layer_norm15.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
+            #     self.encoder_attn.append(MultiHeadAttention(self.n_heads, self.dim, dropout=self.attention_dropout))
+            self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
+            self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
+
+        self.apply(self.init_weights)
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        """ Replace the token embeddings by a resized copy and return the new module. """
+        self.embeddings = self._get_resized_embeddings(self.embeddings, new_num_tokens)
+        return self.embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.attentions[layer].prune_heads(heads)
+
+    def forward(self, input_ids, lengths=None, position_ids=None, langs=None,
+                token_type_ids=None, attention_mask=None, cache=None, head_mask=None):  # src_enc=None, src_len=None, 
+        """ Run the encoder on `input_ids` of shape (bs, slen).
+
+            Returns a tuple `(last_hidden_state, [hidden_states], [attentions])`; the
+            optional entries are present only when the corresponding
+            `output_hidden_states` / `output_attentions` flag is set.
+            When `cache` is given it is updated in place.
+        """
+        # Without explicit lengths, count the non-padding tokens of each row.
+        if lengths is None:
+            lengths = (input_ids != self.pad_index).sum(dim=1).long()
+        # mask = input_ids != self.pad_index
+
+        # check inputs
+        bs, slen = input_ids.size()
+        assert lengths.size(0) == bs
+        assert lengths.max().item() <= slen
+        # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
+        # assert (src_enc is None) == (src_len is None)
+        # if src_enc is not None:
+        #     assert self.is_decoder
+        #     assert src_enc.size(0) == bs
+
+        # generate masks
+        mask, attn_mask = get_masks(slen, lengths, self.causal, padding_mask=attention_mask)
+        # if self.is_decoder and src_enc is not None:
+        #     src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]
+
+        # position_ids
+        if position_ids is None:
+            # Default positions 0..slen-1 with a leading batch dimension of 1; they are
+            # expanded to the batch when added to the embeddings below.
+            position_ids = input_ids.new((slen,)).long()
+            position_ids = torch.arange(slen, out=position_ids).unsqueeze(0)
+        else:
+            assert position_ids.size() == (bs, slen)  # (slen, bs)
+            # position_ids = position_ids.transpose(0, 1)
+
+        # langs
+        if langs is not None:
+            assert langs.size() == (bs, slen)  # (slen, bs)
+            # langs = langs.transpose(0, 1)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x qlen x klen]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.n_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if needed + fp16 compatibility
+        else:
+            # One None per layer so that `head_mask[i]` works in the layer loop.
+            head_mask = [None] * self.n_layers
+
+        # do not recompute cached elements
+        if cache is not None:
+            # Keep only the trailing positions not yet covered by the cache.
+            # NOTE(review): token_type_ids is not sliced here -- confirm whether it can
+            # be combined with a cache.
+            _slen = slen - cache['slen']
+            input_ids = input_ids[:, -_slen:]
+            position_ids = position_ids[:, -_slen:]
+            if langs is not None:
+                langs = langs[:, -_slen:]
+            mask = mask[:, -_slen:]
+            attn_mask = attn_mask[:, -_slen:]
+
+        # embeddings
+        tensor = self.embeddings(input_ids)
+        tensor = tensor + self.position_embeddings(position_ids).expand_as(tensor)
+        if langs is not None:
+            # `lang_embeddings` only exists when config.n_langs > 1 (see __init__).
+            tensor = tensor + self.lang_embeddings(langs)
+        if token_type_ids is not None:
+            # Token types are looked up in the regular token embedding table.
+            tensor = tensor + self.embeddings(token_type_ids)
+        tensor = self.layer_norm_emb(tensor)
+        tensor = F.dropout(tensor, p=self.dropout, training=self.training)
+        # Zero the hidden states of masked (padding) positions.
+        tensor *= mask.unsqueeze(-1).to(tensor.dtype)
+
+        # transformer layers
+        hidden_states = ()
+        attentions = ()
+        for i in range(self.n_layers):
+            # Record the input of layer i (for i == 0 this is the embedding output).
+            if self.output_hidden_states:
+                hidden_states = hidden_states + (tensor,)
+
+            # self attention
+            attn_outputs = self.attentions[i](tensor, attn_mask, cache=cache, head_mask=head_mask[i])
+            attn = attn_outputs[0]
+            if self.output_attentions:
+                attentions = attentions + (attn_outputs[1],)
+            attn = F.dropout(attn, p=self.dropout, training=self.training)
+            # Residual connection followed by layer normalization.
+            tensor = tensor + attn
+            tensor = self.layer_norm1[i](tensor)
+
+            # encoder attention (for decoder only)
+            # if self.is_decoder and src_enc is not None:
+            #     attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
+            #     attn = F.dropout(attn, p=self.dropout, training=self.training)
+            #     tensor = tensor + attn
+            #     tensor = self.layer_norm15[i](tensor)
+
+            # FFN
+            tensor = tensor + self.ffns[i](tensor)
+            tensor = self.layer_norm2[i](tensor)
+            tensor *= mask.unsqueeze(-1).to(tensor.dtype)
+
+        # Add last hidden state
+        if self.output_hidden_states:
+            hidden_states = hidden_states + (tensor,)
+
+        # update cache length
+        if cache is not None:
+            cache['slen'] += tensor.size(1)
+
+        # move back sequence length to dimension 0
+        # tensor = tensor.transpose(0, 1)
+
+        outputs = (tensor,)
+        if self.output_hidden_states:
+            outputs = outputs + (hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (attentions,)
+        return outputs  # outputs, (hidden_states), (attentions)
+
+
+class XLMPredLayer(nn.Module):
+    """
+    Prediction layer (cross_entropy or adaptive_softmax).
+
+    Reads from ``config``: ``asm`` (the adaptive-softmax branch is used unless it is ``False``),
+    ``n_words``, ``pad_index``, ``emb_dim`` and, for the adaptive-softmax branch only,
+    ``asm_cutoffs`` and ``asm_div_value``.
+    """
+    def __init__(self, config):
+        super(XLMPredLayer, self).__init__()
+        self.asm = config.asm
+        self.n_words = config.n_words
+        self.pad_index = config.pad_index
+        dim = config.emb_dim
+
+        if config.asm is False:
+            self.proj = nn.Linear(dim, config.n_words, bias=True)
+        else:
+            self.proj = nn.AdaptiveLogSoftmaxWithLoss(
+                in_features=dim,
+                n_classes=config.n_words,
+                cutoffs=config.asm_cutoffs,
+                div_value=config.asm_div_value,
+                head_bias=True,  # default is False
+            )
+
+    def forward(self, x, y=None):
+        """ Compute the scores, and optionally the loss.
+
+        Args:
+            x: input features whose last dimension is ``config.emb_dim``.
+            y: (`optional`) target indices, one per position of ``x``. With the linear head
+                they may have any shape; they are flattened before the loss is computed.
+
+        Returns:
+            ``(scores,)`` when ``y`` is ``None``, ``(loss, scores)`` otherwise. With the linear
+            head ``scores`` is flattened to ``(-1, n_words)``.
+        """
+        outputs = ()
+        if self.asm is False:
+            scores = self.proj(x).view(-1, self.n_words)
+            outputs = (scores,) + outputs
+            if y is not None:
+                # `scores` was flattened to 2D just above, so the targets have to be 1D as well;
+                # 'mean' is the supported spelling of the deprecated 'elementwise_mean'.
+                loss = F.cross_entropy(scores, y.reshape(-1), reduction='mean')
+                outputs = (loss,) + outputs
+        else:
+            scores = self.proj.log_prob(x)
+            outputs = (scores,) + outputs
+            if y is not None:
+                _, loss = self.proj(x, y)
+                outputs = (loss,) + outputs
+
+        return outputs
+
+
+@add_start_docstrings("""The XLM Model transformer with a language modeling head on top
+    (linear layer with weights tied to the input embeddings). """,
+    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+class XLMWithLMHeadModel(XLMPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for language modeling.
+            The labels are passed to the prediction layer as they are: this class does not shift them.
+            NOTE(review): ``XLMPredLayer`` calls ``F.cross_entropy`` without an ``ignore_index`` argument,
+            so a ``-1`` label is not masked by this code; confirm which value callers should use for
+            positions that must not contribute to the loss.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` holding ``config.n_words`` scores per position
+            (with the linear head the prediction layer flattens them to ``(batch_size * sequence_length, config.n_words)``)
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> model = XLMWithLMHeadModel(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> prediction_scores = outputs[0]  # Without labels, the prediction scores are the first element of the output tuple
+
+    """
+    def __init__(self, config):
+        super(XLMWithLMHeadModel, self).__init__(config)
+        self.transformer = XLMModel(config)
+        self.pred_layer = XLMPredLayer(config)
+
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the embeddings
+        """
+        # NOTE(review): `pred_layer.proj` is an nn.AdaptiveLogSoftmaxWithLoss when config.asm is
+        # enabled; confirm that tying it to the input embeddings is valid in that configuration.
+        self._tie_or_clone_weights(self.pred_layer.proj, self.transformer.embeddings)
+
+    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
+                attention_mask=None, cache=None, labels=None, head_mask=None):
+        """ Run the transformer, then the prediction layer on its last hidden state.
+
+        Returns the prediction layer outputs (``loss`` first when ``labels`` is given, then the
+        scores) followed by whatever the transformer returned after its last hidden state.
+        """
+        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
+                                               token_type_ids=token_type_ids, langs=langs,
+                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+
+        output = transformer_outputs[0]
+        outputs = self.pred_layer(output, labels)
+        outputs = outputs + transformer_outputs[1:]  # Keep hidden states/attentions if they are here
+
+        return outputs
+
+
+@add_start_docstrings("""XLM Model with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+class XLMForSequenceClassification(XLMPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>>
+        >>> model = XLMForSequenceClassification(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=labels)
+        >>> loss, logits = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(XLMForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.transformer = XLMModel(config)
+        self.sequence_summary = SequenceSummary(config)
+
+        self.apply(self.init_weights)
+
+    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
+                attention_mask=None, cache=None, labels=None, head_mask=None):
+        """ Run the transformer, summarize its last hidden state into logits and, when ``labels``
+        is given, prepend the loss (MSE if ``num_labels == 1``, cross-entropy otherwise).
+        """
+        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
+                                               token_type_ids=token_type_ids, langs=langs,
+                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+
+        output = transformer_outputs[0]
+        logits = self.sequence_summary(output)
+
+        outputs = (logits,) + transformer_outputs[1:]  # Keep hidden states/attentions if they are here
+
+        if labels is not None:
+            if self.num_labels == 1:
+                #  We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs
+
+
+@add_start_docstrings("""XLM Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear layers on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    XLM_START_DOCSTRING, XLM_INPUTS_DOCSTRING)
+class XLMForQuestionAnswering(XLMPreTrainedModel):
+    r"""
+        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+        **is_impossible**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels whether a question has an answer or no answer (SQuAD 2.0)
+        **cls_index**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the classification token to use as input for computing plausibility of the answer.
+        **p_mask**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Optional mask of tokens which can't be in answers (e.g. [CLS], [PAD], ...)
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        NOTE(review): the leading elements of the tuple are assembled by ``SQuADHead``; confirm their exact layout there.
+        **loss**: (`optional`, returned when the span labels ``start_positions`` / ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>>
+        >>> model = XLMForQuestionAnswering(config)
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        >>> loss = outputs[0]  # The loss comes first when the span positions are provided
+
+    """
+    def __init__(self, config):
+        super(XLMForQuestionAnswering, self).__init__(config)
+
+        self.transformer = XLMModel(config)
+        self.qa_outputs = SQuADHead(config)
+
+        self.apply(self.init_weights)
+
+    def forward(self, input_ids, lengths=None, position_ids=None, langs=None, token_type_ids=None,
+                attention_mask=None, cache=None, start_positions=None, end_positions=None,
+                cls_index=None, is_impossible=None, p_mask=None, head_mask=None):
+        """ Run the transformer, feed its last hidden state and the span labels to the QA head,
+        and append whatever the transformer returned after its last hidden state.
+        """
+        transformer_outputs = self.transformer(input_ids, lengths=lengths, position_ids=position_ids,
+                                               token_type_ids=token_type_ids, langs=langs,
+                                               attention_mask=attention_mask, cache=cache, head_mask=head_mask)
+
+        output = transformer_outputs[0]
+
+        outputs = self.qa_outputs(output, start_positions=start_positions, end_positions=end_positions,
+                                  cls_index=cls_index, is_impossible=is_impossible, p_mask=p_mask)
+
+        outputs = outputs + transformer_outputs[1:]  # Keep hidden states/attentions if they are here
+
+        return outputs
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
--- a/pytorch_transformers/optimization.py
+++ b/pytorch_transformers/optimization.py
@@ -0,0 +1,183 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch optimization for BERT model."""
+
+import logging
+import math
+
+import torch
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LambdaLR
+
+logger = logging.getLogger(__name__)
+
+class ConstantLRSchedule(LambdaLR):
+    """ Schedule whose learning rate multiplier stays at 1.0 for every step.
+    """
+    def __init__(self, optimizer, last_epoch=-1):
+
+        def lr_lambda(_step):
+            # the multiplier does not depend on the step
+            return 1.0
+
+        super(ConstantLRSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
+class WarmupConstantSchedule(LambdaLR):
+    """ Linear warmup followed by a constant multiplier.
+        The multiplier grows linearly from 0 to 1 during the first `warmup_steps` steps
+        and is 1. for every step from `warmup_steps` on.
+    """
+    def __init__(self, optimizer, warmup_steps, last_epoch=-1):
+
+        def lr_lambda(step):
+            # past the warmup phase: nothing left to scale
+            if not step < warmup_steps:
+                return 1.
+            warmup_span = float(max(1.0, warmup_steps))
+            return float(step) / warmup_span
+
+        super(WarmupConstantSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
+class WarmupLinearSchedule(LambdaLR):
+    """ Linear warmup followed by a linear decay.
+        The multiplier grows linearly from 0 to 1 during the first `warmup_steps` steps,
+        then shrinks linearly from 1. to 0. over the remaining `t_total - warmup_steps` steps
+        (it is clamped at 0. afterwards).
+    """
+    def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
+
+        def lr_lambda(step):
+            in_warmup = step < warmup_steps
+            if in_warmup:
+                return float(step) / float(max(1, warmup_steps))
+            steps_left = float(t_total - step)
+            decay_span = float(max(1.0, t_total - warmup_steps))
+            return max(0.0, steps_left / decay_span)
+
+        super(WarmupLinearSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
+class WarmupCosineSchedule(LambdaLR):
+    """ Linear warmup followed by a cosine decay.
+        The multiplier grows linearly from 0 to 1 during the first `warmup_steps` steps.
+        With the default `cycles` (0.5) it then follows half a cosine period from 1. down to 0.
+        over the remaining `t_total - warmup_steps` steps; other values of `cycles` change how many
+        cosine periods are covered after warmup.
+    """
+    warn_t_total = True
+    def __init__(self, optimizer, warmup_steps, t_total, cycles=.5, last_epoch=-1):
+
+        def lr_lambda(step):
+            if step < warmup_steps:
+                return float(step) / float(max(1.0, warmup_steps))
+            # fraction of the post-warmup phase that has elapsed
+            progress = float(step - warmup_steps) / float(max(1, t_total - warmup_steps))
+            cosine = math.cos(math.pi * float(cycles) * 2.0 * progress)
+            return max(0.0, 0.5 * (1. + cosine))
+
+        super(WarmupCosineSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
+
+class WarmupCosineWithHardRestartsSchedule(LambdaLR):
+    """ Linear warmup followed by cosine cycles with hard restarts.
+        The multiplier grows linearly from 0 to 1 during the first `warmup_steps` steps.
+        After warmup it runs `cycles` (default=1.) cosine decays from 1. towards 0., jumping back
+        to 1. at the start of each cycle, and is 0. once `t_total` steps have elapsed.
+    """
+    def __init__(self, optimizer, warmup_steps, t_total, cycles=1., last_epoch=-1):
+
+        def lr_lambda(step):
+            if step < warmup_steps:
+                return float(step) / float(max(1, warmup_steps))
+            # fraction of the post-warmup phase that has elapsed
+            progress = float(step - warmup_steps) / float(max(1, t_total - warmup_steps))
+            if progress >= 1.0:
+                return 0.0
+            # position inside the current cycle, in [0, 1)
+            cycle_position = (float(cycles) * progress) % 1.0
+            return max(0.0, 0.5 * (1. + math.cos(math.pi * cycle_position)))
+
+        super(WarmupCosineWithHardRestartsSchedule, self).__init__(optimizer, lr_lambda, last_epoch=last_epoch)
+
+
+class AdamW(Optimizer):
+    """ Implements Adam algorithm with weight decay fix.
+
+    Parameters:
+        params: parameters (or parameter groups) to optimize, forwarded to ``Optimizer``.
+        lr (float): learning rate. Default 1e-3.
+        betas (tuple of 2 floats): Adams beta parameters (b1, b2). Default: (0.9, 0.999)
+        eps (float): Adams epsilon. Default: 1e-6
+        weight_decay (float): Weight decay. Default: 0.0
+        correct_bias (bool): can be set to False to avoid correcting bias in Adam (e.g. like in Bert TF repository). Default True.
+
+    Raises:
+        ValueError: if ``lr`` or ``eps`` is negative, or a beta is outside ``[0.0, 1.0[``.
+    """
+    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6, weight_decay=0.0, correct_bias=True):
+        if lr < 0.0:
+            raise ValueError("Invalid learning rate: {} - should be >= 0.0".format(lr))
+        if not 0.0 <= betas[0] < 1.0:
+            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[0]))
+        if not 0.0 <= betas[1]  < 1.0:
+            raise ValueError("Invalid beta parameter: {} - should be in [0.0, 1.0[".format(betas[1]))
+        if not 0.0 <= eps:
+            raise ValueError("Invalid epsilon value: {} - should be >= 0.0".format(eps))
+        defaults = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay,
+                        correct_bias=correct_bias)
+        super(AdamW, self).__init__(params, defaults)
+
+    def step(self, closure=None):
+        """Performs a single optimization step.
+
+        Arguments:
+            closure (callable, optional): A closure that reevaluates the model
+                and returns the loss.
+
+        Returns:
+            The value returned by ``closure``, or ``None`` when no closure is given.
+
+        Raises:
+            RuntimeError: if a parameter has a sparse gradient.
+        """
+        loss = None
+        if closure is not None:
+            loss = closure()
+
+        for group in self.param_groups:
+            for p in group['params']:
+                if p.grad is None:
+                    continue
+                grad = p.grad.data
+                if grad.is_sparse:
+                    raise RuntimeError('Adam does not support sparse gradients, please consider SparseAdam instead')
+
+                state = self.state[p]
+
+                # State initialization
+                if len(state) == 0:
+                    state['step'] = 0
+                    # Exponential moving average of gradient values
+                    state['exp_avg'] = torch.zeros_like(p.data)
+                    # Exponential moving average of squared gradient values
+                    state['exp_avg_sq'] = torch.zeros_like(p.data)
+
+                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
+                beta1, beta2 = group['betas']
+
+                state['step'] += 1
+
+                # Decay the first and second moment running average coefficient
+                # In-place operations to update the averages at the same time
+                # (keyword `alpha=` / `value=` forms: the positional-scalar overloads are deprecated)
+                exp_avg.mul_(beta1).add_(grad, alpha=1.0 - beta1)
+                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1.0 - beta2)
+                denom = exp_avg_sq.sqrt().add_(group['eps'])
+
+                step_size = group['lr']
+                if group['correct_bias']:  # No bias correction for Bert
+                    bias_correction1 = 1.0 - beta1 ** state['step']
+                    bias_correction2 = 1.0 - beta2 ** state['step']
+                    step_size = step_size * math.sqrt(bias_correction2) / bias_correction1
+
+                p.data.addcdiv_(exp_avg, denom, value=-step_size)
+
+                # Just adding the square of the weights to the loss function is *not*
+                # the correct way of using L2 regularization/weight decay with Adam,
+                # since that will interact with the m and v parameters in strange ways.
+                #
+                # Instead we want to decay the weights in a manner that doesn't interact
+                # with the m/v parameters. This is equivalent to adding the square
+                # of the weights to the loss with plain (non-momentum) SGD.
+                # Add weight decay at the end (fixed version)
+                if group['weight_decay'] > 0.0:
+                    p.data.add_(p.data, alpha=-group['lr'] * group['weight_decay'])
+
+        return loss
--- a/pytorch_transformers/tests/init.py
+++ b/pytorch_transformers/tests/init.py
--- a/Show More
+++ b/Show More