From 117ed92992a8b7ec45b399a2b5e2f9b66358a7d4 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 16 Jul 2019 11:58:47 -0400
Subject: [PATCH 01/35] RestructuredText table for pretrained models.

---
 docs/source/pretrained_models.rst | 147 +++++++++++++++++++-----------
 1 file changed, 94 insertions(+), 53 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 2d72977951..e4ad7a6eaa 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -3,57 +3,98 @@ Pretrained models
 
 Here is the full list of the currently provided pretrained models together with a short presentation of each model.
 
-+===============+============================================================+===========================+ 
-| Architecture  | Shortcut name                                              | Details of the model      |
-+===============+============================================================+===========================+ 
-|               | ``bert-base-uncased``                                      | 12-layer, 768-hidden, 12-heads, 110M parameters
-|               |                                                            | Trained on lower-cased English text                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-large-uncased``                                     | 24-layer, 1024-hidden, 16-heads, 340M parameters
-|               |                                                            | Trained on lower-cased English text                  |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-base-cased``                                        | 12-layer, 768-hidden, 12-heads, 110M parameters
-|               |                                                            | Trained on cased English text                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-large-cased``                                       | 24-layer, 1024-hidden, 16-heads, 340M parameters                  |
-|               |                                                            | Trained on cased English text                  |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-base-multilingual-uncased``                         | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters
-|               |                                                            | Trained on lower-cased text in the top 102 languages with the largest Wikipedias
-|               |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-base-multilingual-cased``                           | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters                  |
-|               |                                                            | Trained on cased text in the top 104 languages with the largest Wikipedias
-|               |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|    BERT       | ``bert-base-chinese``                                      | 12-layer, 768-hidden, 12-heads, 110M parameters                  |
-|               |                                                            | Trained on cased Chinese Simplified and Traditional text |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-base-german-cased``                                 | 12-layer, 768-hidden, 12-heads, 110M parameters                  |
-|               |                                                            | Trained on cased German text by Deepset.ai |
-|               |                                                            | (see `details on deepset.ai website <https://deepset.ai/german-bert>`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-large-uncased-whole-word-masking``                  | 24-layer, 1024-hidden, 16-heads, 340M parameters                  |
-|               |                                                            | Trained on lower-cased English text using Whole-Word-Masking                  |
-|               |                                                            | (see `details <https://github.com/google-research/bert/#bert>`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-large-cased-whole-word-masking``                    | 24-layer, 1024-hidden, 16-heads, 340M parameters                  |
-|               |                                                            | Trained on cased English text using Whole-Word-Masking                  |
-|               |                                                            | (see `details <https://github.com/google-research/bert/#bert>`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | 24-layer, 1024-hidden, 16-heads, 340M parameters                  |
-|               |                                                            | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                  |
-|               |                                                            | (see details of fine-tuning in the `example section`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-large-cased-whole-word-masking-finetuned-squad``    | 24-layer, 1024-hidden, 16-heads, 340M parameters                  |
-|               |                                                            | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                  |
-|               |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-base-cased-finetuned-mrpc``                         | 12-layer, 768-hidden, 12-heads, 110M parameters                  |
-|               |                                                            | The ``bert-base-cased`` model fine-tuned on MRPC                  |
-|               |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`_)                 |
-+---------------+------------------------------------------------------------+---------------------------+ 
-|    GPT        | Cells may span columns.                                                                |
-+---------------+----------------------------------------------------------------------------------------+ 
 
-.. <https://huggingface.co/pytorch-transformers/examples.html>`_
\ No newline at end of file
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+| Architecture      | Shortcut name                                              | Details of the model                                                                                                      |
++===================+============================================================+===========================================================================================================================+
+| BERT              | ``bert-base-uncased``                                      | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | Trained on lower-cased English text                                                                                       |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-uncased``                                     | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | Trained on lower-cased English text                                                                                       |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-cased``                                        | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | Trained on cased English text                                                                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-cased``                                       | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | Trained on cased English text                                                                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-multilingual-uncased``                         | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters                                               |
+|                   |                                                            | Trained on lower-cased text in the top 102 languages with the largest Wikipedias                                          |
+|                   |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__)                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-multilingual-cased``                           | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters                                                    |
+|                   |                                                            | Trained on cased text in the top 104 languages with the largest Wikipedias                                                |
+|                   |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__)                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-chinese``                                      | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | Trained on cased Chinese Simplified and Traditional text                                                                  |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-german-cased``                                 | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | Trained on cased German text by Deepset.ai                                                                                |
+|                   |                                                            | (see `details on deepset.ai website <https://deepset.ai/german-bert>`__)                                                  |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-uncased-whole-word-masking``                  | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | Trained on lower-cased English text using Whole-Word-Masking                                                              |
+|                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__)                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-cased-whole-word-masking``                    | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | Trained on cased English text using Whole-Word-Masking                                                                    |
+|                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__)                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                   |
+|                   |                                                            | (see details of fine-tuning in the `example section`__)                                                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                     |
+|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)       |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-cased-finetuned-mrpc``                         | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | The ``bert-base-cased`` model fine-tuned on MRPC                                                                          |
+|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)       |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+| GPT               | ``openai-gpt``                                             | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | OpenAI GPT English model                                                                                                  |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+| GPT-2             | ``gpt2``                                                   | 12-layer, 768-hidden, 12-heads, 117M parameters                                                                           |
+|                   |                                                            | OpenAI GPT-2 English model                                                                                                |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``gpt2-medium``                                            | 24-layer, 1024-hidden, 16-heads, 345M parameters                                                                          |
+|                   |                                                            | OpenAI's Medium-sized GPT-2 English model                                                                                 |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+| Transformer-XL    | ``transfo-xl-wt103``                                       | 18-layer, 1024-hidden, 16-heads, 257M parameters                                                                          |
+|                   |                                                            | English model trained on wikitext-103                                                                                     |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+| XLNet             | ``xlnet-base-cased``                                       | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | XLNet English model                                                                                                       |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlnet-large-cased``                                      | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | XLNet Large English model                                                                                                 |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+| XLM               | ``xlm-mlm-en-2048``                                        | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM English model                                                                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-ende-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM English-German Multi-language model                                                                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-enfr-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM English-French Multi-language model                                                                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-enro-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM English-Romanian Multi-language model                                                                                 |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-xnli15-1024``                                    | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM Model pre-trained with MLM on the `15 XNLI languages<https://github.com/facebookresearch/XNLI>`__.                    |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-tlm-xnli15-1024``                                | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages<https://github.com/facebookresearch/XNLI>`__.              |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-clm-enfr-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM English model trained with CLM (Causal Language Modeling)                                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-clm-ende-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM English-German Multi-language model trained with CLM (Causal Language Modeling)                                       |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+
+.. <https://huggingface.co/pytorch-transformers/examples.html>`__
\ No newline at end of file

From 9d381e7be9fb97e09777fa66aa3e336ca132af70 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 17 Jul 2019 09:25:38 -0400
Subject: [PATCH 02/35] Fixed incorrect links in the PretrainedModel

---
 docs/source/pretrained_models.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index e4ad7a6eaa..b23a96ff7c 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -43,8 +43,8 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__)                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
-|                   |                                                            | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                   |
-|                   |                                                            | (see details of fine-tuning in the `example section`__)                                                                   |
+|                   |                                                            | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD (see details of fine-tuning in the                |
+|                   |                                                            | `example section <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`__)                           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
 |                   |                                                            | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                     |
@@ -85,10 +85,10 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | XLM English-Romanian Multi-language model                                                                                 |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-mlm-xnli15-1024``                                    | 12-layer, 1024-hidden, 8-heads                                                                                            |
-|                   |                                                            | XLM Model pre-trained with MLM on the `15 XNLI languages<https://github.com/facebookresearch/XNLI>`__.                    |
+|                   |                                                            | XLM Model pre-trained with MLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                   |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-mlm-tlm-xnli15-1024``                                | 12-layer, 1024-hidden, 8-heads                                                                                            |
-|                   |                                                            | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages<https://github.com/facebookresearch/XNLI>`__.              |
+|                   |                                                            | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-clm-enfr-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
 |                   |                                                            | XLM English model trained with CLM (Causal Language Modeling)                                                             |

From edfd965ac8a5446adb2c94ad043263b3144b3f95 Mon Sep 17 00:00:00 2001
From: David Pollack <david@i2x.ai>
Date: Fri, 26 Jul 2019 14:13:46 +0200
Subject: [PATCH 03/35] fix convert_to_tf

---
 .../convert_pytorch_checkpoint_to_tf.py                | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
index b8858ee3dc..a2e7b5c41a 100644
--- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
+++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
@@ -72,11 +72,11 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
         return 'bert/{}'.format(name)
 
     def assign_tf_var(tensor:np.ndarray, name:str):
-        tmp_var = tf.Variable(initial_value=tensor)
-        tf_var = tf.get_variable(dtype=tmp_var.dtype, shape=tmp_var.shape, name=name)
-        op = tf.assign(ref=tf_var, value=tmp_var)
-        session.run(tf.variables_initializer([tmp_var, tf_var]))
-        session.run(fetches=[op, tf_var])
+        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
+        tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name)
+        session.run(tf.variables_initializer([tf_var]))
+        tf.keras.backend.set_value(tf_var, tensor)
+        session.run(tf_var)
         return tf_var
 
     for var_name in state_dict:

From 09ecf225e9ac00f78ecf9246957128f5d7d79a52 Mon Sep 17 00:00:00 2001
From: David Pollack <david@i2x.ai>
Date: Fri, 26 Jul 2019 15:20:44 +0200
Subject: [PATCH 04/35] fixed the fix.  tf session madness.

---
 .../convert_pytorch_checkpoint_to_tf.py       | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
index a2e7b5c41a..c24dddc4d6 100644
--- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
+++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
@@ -62,34 +62,34 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
     if not os.path.isdir(ckpt_dir):
         os.makedirs(ckpt_dir)
 
-    session = tf.Session()
     state_dict = model.state_dict()
-    tf_vars = []
 
     def to_tf_var_name(name:str):
         for patt, repl in iter(var_map):
             name = name.replace(patt, repl)
         return 'bert/{}'.format(name)
 
-    def assign_tf_var(tensor:np.ndarray, name:str):
+    def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session):
         tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
-        tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name)
+        tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
         session.run(tf.variables_initializer([tf_var]))
-        tf.keras.backend.set_value(tf_var, tensor)
         session.run(tf_var)
         return tf_var
 
-    for var_name in state_dict:
-        tf_name = to_tf_var_name(var_name)
-        torch_tensor = state_dict[var_name].numpy()
-        if any([x in var_name for x in tensors_to_transopse]):
-            torch_tensor = torch_tensor.T
-        tf_tensor = assign_tf_var(tensor=torch_tensor, name=tf_name)
-        tf_vars.append(tf_tensor)
-        print("{0}{1}initialized".format(tf_name, " " * (60 - len(tf_name))))
+    tf.reset_default_graph()
+    with tf.Session() as session:
+        for var_name in state_dict:
+            tf_name = to_tf_var_name(var_name)
+            torch_tensor = state_dict[var_name].numpy()
+            if any([x in var_name for x in tensors_to_transopse]):
+                torch_tensor = torch_tensor.T
+            tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
+            tf.keras.backend.set_value(tf_var, torch_tensor)
+            tf_weight = session.run(tf_var)
+            print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))
 
-    saver = tf.train.Saver(tf_vars)
-    saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
+        saver = tf.train.Saver(tf.trainable_variables())
+        saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
 
 
 def main(raw_args=None):

From ac42049c0877104632cff44d52ddb0a06d927ee0 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 26 Jul 2019 17:08:59 +0200
Subject: [PATCH 05/35] add auto models and auto tokenizer

---
 pytorch_transformers/modeling_auto.py     | 230 ++++++++++++++++++++++
 pytorch_transformers/tokenization_auto.py | 100 ++++++++++
 2 files changed, 330 insertions(+)
 create mode 100644 pytorch_transformers/modeling_auto.py
 create mode 100644 pytorch_transformers/tokenization_auto.py

diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py
new file mode 100644
index 0000000000..68eb85cbd8
--- /dev/null
+++ b/pytorch_transformers/modeling_auto.py
@@ -0,0 +1,230 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Auto Model class. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+
+from .modeling_bert import BertConfig, BertModel
+from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel
+from .modeling_gpt2 import GPT2Config, GPT2Model
+from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel
+from .modeling_xlnet import XLNetConfig, XLNetModel
+from .modeling_xlm import XLMConfig, XLMModel
+
+logger = logging.getLogger(__name__)
+
+class AutoConfig(object):
+    r""":class:`~pytorch_transformers.AutoConfig` is a generic configuration class
+        that will be instantiated as one of the configuration classes of the library
+        when created with the `AutoConfig.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method take care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `bert`: BertConfig (Bert model)
+            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
+            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
+            - contains `xlnet`: XLNetConfig (XLNet model)
+            - contains `xlm`: XLMConfig (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throw an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoConfig is designed to be instantiated "
+            "using the `AutoConfig.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r""" Instantiate a one of the configuration classes of the library
+        from a pre-trained model configuration.
+
+        The configuration class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `bert`: BertConfig (Bert model)
+            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
+            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
+            - contains `xlnet`: XLNetConfig (XLNet model)
+            - contains `xlm`: XLMConfig (XLM model)
+
+        Params:
+            **pretrained_model_name_or_path**: either:
+                - a string with the `shortcut name` of a pre-trained model configuration to load from cache
+                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
+                - a path to a `directory` containing a configuration file saved
+                    using the `save_pretrained(save_directory)` method.
+                - a path or url to a saved configuration `file`.
+            **cache_dir**: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+            **return_unused_kwargs**: (`optional`) bool:
+                - If False, then this function returns just the final configuration object.
+                - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
+                is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
+                ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
+            **kwargs**: (`optional`) dict:
+                Dictionary of key/value pairs with which to update the configuration object after loading.
+                - The values in kwargs of any keys which are configuration attributes will be used
+                to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
+                by the `return_unused_kwargs` keyword parameter.
+
+        Examples::
+
+            >>> config = AutoConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+            >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
+            >>> config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+            >>> assert config.output_attention == True
+            >>> config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
+            >>>                                                    foo=False, return_unused_kwargs=True)
+            >>> assert config.output_attention == True
+            >>> assert unused_kwargs == {'foo': False}
+
+        """
+        if 'bert' in pretrained_model_name_or_path:
+            return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return GPT2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TransfoXLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm'".format(pretrained_model_name_or_path))
+
+
+class AutoModel(object):
+    r"""
+        :class:`~pytorch_transformers.AutoModel` is a generic model class
+        that will be instantiated as one of the base model classes of the library
+        when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method take care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `bert`: BertConfig (Bert model)
+            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
+            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
+            - contains `xlnet`: XLNetConfig (XLNet model)
+            - contains `xlm`: XLMConfig (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throw an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoModel is designed to be instantiated "
+            "using the `AutoModel.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiate a one of the base model classes of the library
+        from a pre-trained model configuration.
+
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `bert`: BertConfig (Bert model)
+            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
+            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
+            - contains `xlnet`: XLNetConfig (XLNet model)
+            - contains `xlm`: XLMConfig (XLM model)
+
+            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are desactivated)
+            To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            **pretrained_model_name_or_path**: either:
+                - a string with the `shortcut name` of a pre-trained model to load from cache
+                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
+                - a path to a `directory` containing a configuration file saved
+                    using the `save_pretrained(save_directory)` method.
+                - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
+                    In this case, ``from_tf`` should be set to True and a configuration object should be
+                    provided as `config` argument. This loading option is slower than converting the TensorFlow
+                    checkpoint in a PyTorch model using the provided conversion scripts and loading
+                    the PyTorch model afterwards.
+            **model_args**: (`optional`) Sequence:
+                All remaning positional arguments will be passed to the underlying model's __init__ function
+            **config**: an optional configuration for the model to use instead of an automatically loaded configuation.
+                Configuration can be automatically loaded when:
+                - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
+                - the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
+            **state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
+                from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuraton but load your own weights.
+                In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
+                a simpler option.
+            **cache_dir**: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+            **output_loading_info**: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+            **kwargs**: (`optional`) dict:
+                Dictionary of key, values to update the configuration object after loading.
+                Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
+
+               - If a configuration is provided with `config`, **kwargs will be directly passed
+                 to the underlying model's __init__ method.
+               - If a configuration is not provided, **kwargs will be first passed to the pretrained
+                 model configuration class loading function (`PretrainedConfig.from_pretrained`).
+                 Each key of **kwargs that corresponds to a configuration attribute
+                 will be used to override said attribute with the supplied **kwargs value.
+                 Remaining keys that do not correspond to any configuration attribute will
+                 be passed to the underlying model's __init__ function.
+
+        Examples::
+
+            >>> model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            >>> model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            >>> model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            >>> assert model.config.output_attention == True
+            >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            >>> config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            >>> model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'bert' in pretrained_model_name_or_path:
+            return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm'".format(pretrained_model_name_or_path))
+
diff --git a/pytorch_transformers/tokenization_auto.py b/pytorch_transformers/tokenization_auto.py
new file mode 100644
index 0000000000..66d0ce51ba
--- /dev/null
+++ b/pytorch_transformers/tokenization_auto.py
@@ -0,0 +1,100 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Auto Model class. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+
+from .tokenization_bert import BertTokenizer
+from .tokenization_openai import OpenAIGPTTokenizer
+from .tokenization_gpt2 import GPT2Tokenizer
+from .tokenization_transfo_xl import TransfoXLTokenizer
+from .tokenization_xlnet import XLNetTokenizer
+from .tokenization_xlm import XLMTokenizer
+
+logger = logging.getLogger(__name__)
+
+class AutoTokenizer(object):
+    r""":class:`~pytorch_transformers.AutoTokenizer` is a generic tokenizer class
+        that will be instantiated as one of the tokenizer classes of the library
+        when created with the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method take care of returning the correct tokenizer class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The tokenizer class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `bert`: BertTokenizer (Bert model)
+            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
+            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
+            - contains `xlnet`: XLNetTokenizer (XLNet model)
+            - contains `xlm`: XLMTokenizer (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throw an error).
+    """
+    def __init__(self):
+        raise EnvironmentError("AutoTokenizer is designed to be instantiated "
+            "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method.")
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
+        r""" Instantiate a one of the tokenizer classes of the library
+        from a pre-trained model vocabulary.
+
+        The tokenizer class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `bert`: BertTokenizer (Bert model)
+            - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
+            - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
+            - contains `xlnet`: XLNetTokenizer (XLNet model)
+            - contains `xlm`: XLMTokenizer (XLM model)
+
+        Params:
+            **pretrained_model_name_or_path**: either:
+                - a string with the `shortcut name` of a pre-trained model configuration to load from cache
+                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
+                - a path to a `directory` containing a configuration file saved
+                    using the `save_pretrained(save_directory)` method.
+                - a path or url to a saved configuration `file`.
+            **cache_dir**: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+        Examples::
+
+            >>> config = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
+            >>> config = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+
+        """
+        if 'bert' in pretrained_model_name_or_path:
+            return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'gpt2' in pretrained_model_name_or_path:
+            return GPT2Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            return TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'xlnet' in pretrained_model_name_or_path:
+            return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'xlm' in pretrained_model_name_or_path:
+            return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+
+        raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                         "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                         "'xlm'".format(pretrained_model_name_or_path))

From 57e54ec070258189695ba8cacdf7d2bcaf1c72bc Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 26 Jul 2019 17:09:07 +0200
Subject: [PATCH 06/35] add unk_token to gpt2

---
 pytorch_transformers/tokenization_gpt2.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index 43c57c9cd3..afcdf1e64e 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -102,7 +102,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, merges_file, errors='replace',
+    def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
                  bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
         super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, **kwargs)
 
@@ -177,9 +177,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
 
     def _convert_token_to_id(self, token):
         """ Converts a token (str/unicode) in an id using the vocab. """
-        if token in self.encoder:
-            return self.encoder.get(token)
-        return self.encoder.get(self.unk_token)
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""

From 27b0f86d36a1ee25dcc70ba602aefa556dc5f0a9 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 26 Jul 2019 17:09:21 +0200
Subject: [PATCH 07/35] clean up pretrained

---
 pytorch_transformers/tokenization_utils.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index f603a29d74..2b3219c4cc 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -152,11 +152,13 @@ class PreTrainedTokenizer(object):
 
 
     @classmethod
-    def _from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
+    def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         """
         Instantiate a PreTrainedTokenizer from pre-trained vocabulary files.
         Download and cache the vocabulary files if needed.
         """
+        cache_dir = kwargs.pop('cache_dir', None)
+
         s3_models = list(cls.max_model_input_sizes.keys())
         vocab_files = {}
         if pretrained_model_name_or_path in s3_models:
@@ -308,7 +310,8 @@ class PreTrainedTokenizer(object):
 
         to_add_tokens = []
         for token in new_tokens:
-            if self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
+            if token != self.unk_token and \
+                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
                 to_add_tokens.append(token)
                 logger.info("Adding %s to the vocabulary", token)
 

From c717d38573dbb814a7b93e8057cf0e63c2c8e9df Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 26 Jul 2019 23:30:48 +0200
Subject: [PATCH 08/35] dictionnary => dictionary

---
 docs/source/model_doc/overview.rst         | 2 +-
 docs/source/serialization.rst              | 2 +-
 hubconfs/bert_hubconf.py                   | 2 +-
 hubconfs/gpt_hubconf.py                    | 2 +-
 hubconfs/transformer_xl_hubconf.py         | 2 +-
 pytorch_transformers/modeling_utils.py     | 4 ++--
 pytorch_transformers/tokenization_utils.py | 4 ++--
 7 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/docs/source/model_doc/overview.rst b/docs/source/model_doc/overview.rst
index 8c77efd3f9..4cca4eb846 100644
--- a/docs/source/model_doc/overview.rst
+++ b/docs/source/model_doc/overview.rst
@@ -96,7 +96,7 @@ where
   ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
 
 * ``from_tf``\ : should we load the weights from a locally saved TensorFlow checkpoint
-* ``state_dict``\ : an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
+* ``state_dict``\ : an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
 * ``*inputs``\ , `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
 
 ``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`__ or the original TensorFlow repository.
diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst
index fb947ffb51..be5197135d 100644
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -40,7 +40,7 @@ where
 
 - `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information).
 - `from_tf`: should we load the weights from a locally saved TensorFlow checkpoint
-- `state_dict`: an optional state dictionnary (collections.OrderedDict object) to use instead of Google pre-trained models
+- `state_dict`: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
 - `*inputs`, `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
 
 `Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository.
diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py
index 0ee0df6697..a0221ff9e1 100644
--- a/hubconfs/bert_hubconf.py
+++ b/hubconfs/bert_hubconf.py
@@ -37,7 +37,7 @@ bert_docstring = """
                  checkpoint
         cache_dir: an optional path to a folder in which the pre-trained models
                    will be cached.
-        state_dict: an optional state dictionnary
+        state_dict: an optional state dictionary
                     (collections.OrderedDict object) to use instead of Google
                     pre-trained models
         *inputs, **kwargs: additional input for the specific Bert class
diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py
index 1683c881fa..c58c1fa708 100644
--- a/hubconfs/gpt_hubconf.py
+++ b/hubconfs/gpt_hubconf.py
@@ -40,7 +40,7 @@ gpt_docstring = """
 				. a series of NumPy files containing OpenAI TensorFlow trained weights
 		from_tf: should we load the weights from a locally saved TensorFlow checkpoint
 		cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-		state_dict: an optional state dictionnary (collections.OrderedDict object)
+		state_dict: an optional state dictionary (collections.OrderedDict object)
 		        	to use instead of pre-trained models
 		*inputs, **kwargs: additional input for the specific OpenAI-GPT class
 """
diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py
index d89db894ad..cfcc6aef5a 100644
--- a/hubconfs/transformer_xl_hubconf.py
+++ b/hubconfs/transformer_xl_hubconf.py
@@ -23,7 +23,7 @@ transformer_xl_docstring = """
                 . `model.chkpt` a TensorFlow checkpoint
         from_tf: should we load the weights from a locally saved TensorFlow checkpoint
         cache_dir: an optional path to a folder in which the pre-trained models will be cached.
-        state_dict: an optional state dictionnary (collections.OrderedDict object) to use instead of pre-trained models
+        state_dict: an optional state dictionary (collections.OrderedDict object) to use instead of pre-trained models
         *inputs, **kwargs: additional input for the specific TransformerXL class
 """
 
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 66bfe99d85..4fabd49baf 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -358,7 +358,7 @@ class PreTrainedModel(nn.Module):
                 Dictionary of key, values to update the configuration object after loading.
                 Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
 
-               - If a configuration is provided with `config`, **kwargs will be directly passed
+               - If a configuration is providedictionaryfig`, **kwargs will be directly passed
                  to the underlying model's __init__ method.
                - If a configuration is not provided, **kwargs will be first passed to the pretrained
                  model configuration class loading function (`PretrainedConfig.from_pretrained`).
@@ -367,7 +367,7 @@ class PreTrainedModel(nn.Module):
                  Remaining keys that do not correspond to any configuration attribute will
                  be passed to the underlying model's __init__ function.
 
-        Examples::
+        Examples::dictionary
 
             >>> model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
             >>> model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 2b3219c4cc..eaef2fed1e 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -37,7 +37,7 @@ class PreTrainedTokenizer(object):
             additional_special_tokens = []
 
         We defined an added_tokens_encoder to add new tokens to the vocabulary without having to handle the
-            specific vocabulary augmentation methods of the various underlying dictionnary structures (BPE, sentencepiece...).
+            specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
     """
     vocab_files_names = {}
     pretrained_vocab_files_map = {}
@@ -324,7 +324,7 @@ class PreTrainedTokenizer(object):
 
 
     def add_special_tokens(self, special_tokens_dict):
-        """ Add a dictionnary of special tokens (eos, pad, cls...) to the encoder and link them
+        """ Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
             to class attributes. If the special tokens are not in the vocabulary, they are added
             to it and indexed starting from the last index of the current vocabulary.
 

From ac27548b25ffef5966bd11c90419230cbeafe06e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 27 Jul 2019 11:50:47 +0200
Subject: [PATCH 09/35] fix unk_token test

---
 pytorch_transformers/tokenization_gpt2.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index afcdf1e64e..29a9ae7660 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -104,7 +104,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):
 
     def __init__(self, vocab_file, merges_file, errors='replace', unk_token="<|endoftext|>",
                  bos_token="<|endoftext|>", eos_token="<|endoftext|>", **kwargs):
-        super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, **kwargs)
+        super(GPT2Tokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs)
 
         self.encoder = json.load(open(vocab_file))
         self.decoder = {v:k for k,v in self.encoder.items()}

From 4cc1bf81ee1326f1779f99ed5cc85370a550ef4a Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 27 Jul 2019 12:08:21 +0200
Subject: [PATCH 10/35] typos

---
 pytorch_transformers/modeling_auto.py     | 4 ++--
 pytorch_transformers/modeling_utils.py    | 4 ++--
 pytorch_transformers/tokenization_bert.py | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py
index 68eb85cbd8..aa50b1526d 100644
--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -157,7 +157,7 @@ class AutoModel(object):
             - contains `xlnet`: XLNetConfig (XLNet model)
             - contains `xlm`: XLMConfig (XLM model)
 
-            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are desactivated)
+            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
             To train the model, you should first set it back in training mode with `model.train()`
 
         Params:
@@ -179,7 +179,7 @@ class AutoModel(object):
                 - the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
             **state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
                 from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuraton but load your own weights.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                 In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
                 a simpler option.
             **cache_dir**: (`optional`) string:
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 4fabd49baf..7ae834f5e5 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -324,7 +324,7 @@ class PreTrainedModel(nn.Module):
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         r"""Instantiate a pretrained pytorch model from a pre-trained model configuration.
 
-            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are desactivated)
+            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
             To train the model, you should first set it back in training mode with `model.train()`
 
         Params:
@@ -346,7 +346,7 @@ class PreTrainedModel(nn.Module):
                 - the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
             **state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
                 from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuraton but load your own weights.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
                 In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
                 a simpler option.
             **cache_dir**: (`optional`) string:
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index f9c97b7d12..d9cd881dfd 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -119,7 +119,7 @@ class BertTokenizer(PreTrainedTokenizer):
                 Only has an effect when do_basic_tokenize=True
             **tokenize_chinese_chars**: (`optional`) boolean (default True)
                 Whether to tokenize Chinese characters.
-                This should likely be desactivated for Japanese:
+                This should likely be deactivated for Japanese:
                 see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
         """
         super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
@@ -214,7 +214,7 @@ class BasicTokenizer(object):
                 List of token not to split.
             **tokenize_chinese_chars**: (`optional`) boolean (default True)
                 Whether to tokenize Chinese characters.
-                This should likely be desactivated for Japanese:
+                This should likely be deactivated for Japanese:
                 see: https://github.com/huggingface/pytorch-pretrained-BERT/issues/328
         """
         if never_split is None:

From bfbe52ec397f0e43641ee58d4e347deff5216777 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 27 Jul 2019 20:25:39 +0200
Subject: [PATCH 11/35] cleaning up example docstrings

---
 hubconfs/bert_hubconf.py                    | 206 ++++++++++----------
 hubconfs/gpt2_hubconf.py                    |  84 ++++----
 hubconfs/gpt_hubconf.py                     |  76 ++++----
 hubconfs/transformer_xl_hubconf.py          |  68 +++----
 hubconfs/xlm_hubconf.py                     |  80 ++++----
 hubconfs/xlnet_hubconf.1.py                 |  84 ++++----
 pytorch_transformers/modeling_auto.py       |  32 +--
 pytorch_transformers/modeling_bert.py       | 122 ++++++------
 pytorch_transformers/modeling_gpt2.py       |  40 ++--
 pytorch_transformers/modeling_openai.py     |  40 ++--
 pytorch_transformers/modeling_transfo_xl.py |  24 +--
 pytorch_transformers/modeling_utils.py      |  32 +--
 pytorch_transformers/modeling_xlm.py        |  58 +++---
 pytorch_transformers/modeling_xlnet.py      |  68 +++----
 pytorch_transformers/tokenization_auto.py   |   4 +-
 15 files changed, 509 insertions(+), 509 deletions(-)

diff --git a/hubconfs/bert_hubconf.py b/hubconfs/bert_hubconf.py
index a0221ff9e1..6e2830617f 100644
--- a/hubconfs/bert_hubconf.py
+++ b/hubconfs/bert_hubconf.py
@@ -84,12 +84,12 @@ def bertTokenizer(*args, **kwargs):
                  Default: ["[UNK]", "[SEP]", "[PAD]", "[CLS]", "[MASK]"]
 
     Example:
-        >>> import torch
-        >>> sentence = 'Hello, World!'
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
-        >>> toks = tokenizer.tokenize(sentence)
+        import torch
+        sentence = 'Hello, World!'
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        toks = tokenizer.tokenize(sentence)
         ['Hello', '##,', 'World', '##!']
-        >>> ids = tokenizer.convert_tokens_to_ids(toks)
+        ids = tokenizer.convert_tokens_to_ids(toks)
         [8667, 28136, 1291, 28125]
     """
     tokenizer = BertTokenizer.from_pretrained(*args, **kwargs)
@@ -105,20 +105,20 @@ def bertModel(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
-        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        >>> tokenized_text = tokenizer.tokenize(text)
-        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        >>> tokens_tensor = torch.tensor([indexed_tokens])
-        >>> segments_tensors = torch.tensor([segments_ids])
+        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+        tokenized_text = tokenizer.tokenize(text)
+        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+        tokens_tensor = torch.tensor([indexed_tokens])
+        segments_tensors = torch.tensor([segments_ids])
         # Load bertModel
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'bertModel', 'bert-base-cased')
+        model.eval()
         # Predict hidden states features for each layer
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 encoded_layers, _ = model(tokens_tensor, segments_tensors)
     """
     model = BertModel.from_pretrained(*args, **kwargs)
@@ -134,20 +134,20 @@ def bertForNextSentencePrediction(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
-        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        >>> tokenized_text = tokenizer.tokenize(text)
-        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        >>> tokens_tensor = torch.tensor([indexed_tokens])
-        >>> segments_tensors = torch.tensor([segments_ids])
+        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+        tokenized_text = tokenizer.tokenize(text)
+        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+        tokens_tensor = torch.tensor([indexed_tokens])
+        segments_tensors = torch.tensor([segments_ids])
         # Load bertForNextSentencePrediction
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForNextSentencePrediction', 'bert-base-cased')
+        model.eval()
         # Predict the next sentence classification logits
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 next_sent_classif_logits = model(tokens_tensor, segments_tensors)
     """
     model = BertForNextSentencePrediction.from_pretrained(*args, **kwargs)
@@ -164,17 +164,17 @@ def bertForPreTraining(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
-        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        >>> tokenized_text = tokenizer.tokenize(text)
-        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        >>> tokens_tensor = torch.tensor([indexed_tokens])
-        >>> segments_tensors = torch.tensor([segments_ids])
+        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+        tokenized_text = tokenizer.tokenize(text)
+        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+        tokens_tensor = torch.tensor([indexed_tokens])
+        segments_tensors = torch.tensor([segments_ids])
         # Load bertForPreTraining
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
-        >>> masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
+        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForPreTraining', 'bert-base-cased')
+        masked_lm_logits_scores, seq_relationship_logits = model(tokens_tensor, segments_tensors)
     """
     model = BertForPreTraining.from_pretrained(*args, **kwargs)
     return model
@@ -188,25 +188,25 @@ def bertForMaskedLM(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
-        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        >>> tokenized_text = tokenizer.tokenize(text)
-        >>> masked_index = 8
-        >>> tokenized_text[masked_index] = '[MASK]'
-        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        >>> tokens_tensor = torch.tensor([indexed_tokens])
-        >>> segments_tensors = torch.tensor([segments_ids])
+        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+        tokenized_text = tokenizer.tokenize(text)
+        masked_index = 8
+        tokenized_text[masked_index] = '[MASK]'
+        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+        tokens_tensor = torch.tensor([indexed_tokens])
+        segments_tensors = torch.tensor([segments_ids])
         # Load bertForMaskedLM
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMaskedLM', 'bert-base-cased')
+        model.eval()
         # Predict all tokens
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 predictions = model(tokens_tensor, segments_tensors)
-        >>> predicted_index = torch.argmax(predictions[0, masked_index]).item()
-        >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+        predicted_index = torch.argmax(predictions[0, masked_index]).item()
+        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
         'henson'
     """
     model = BertForMaskedLM.from_pretrained(*args, **kwargs)
@@ -230,24 +230,24 @@ def bertForSequenceClassification(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
-        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        >>> tokenized_text = tokenizer.tokenize(text)
-        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        >>> tokens_tensor = torch.tensor([indexed_tokens])
-        >>> segments_tensors = torch.tensor([segments_ids])
+        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+        tokenized_text = tokenizer.tokenize(text)
+        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+        tokens_tensor = torch.tensor([indexed_tokens])
+        segments_tensors = torch.tensor([segments_ids])
         # Load bertForSequenceClassification
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForSequenceClassification', 'bert-base-cased', num_labels=2)
+        model.eval()
         # Predict the sequence classification logits
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 seq_classif_logits = model(tokens_tensor, segments_tensors)
         # Or get the sequence classification loss
-        >>> labels = torch.tensor([1])
-        >>> seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
+        labels = torch.tensor([1])
+        seq_classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
     """
     model = BertForSequenceClassification.from_pretrained(*args, **kwargs)
     return model
@@ -265,24 +265,24 @@ def bertForMultipleChoice(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
-        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        >>> tokenized_text = tokenizer.tokenize(text)
-        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        >>> tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
-        >>> segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
+        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+        tokenized_text = tokenizer.tokenize(text)
+        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+        tokens_tensor = torch.tensor([indexed_tokens, indexed_tokens]).unsqueeze(0)
+        segments_tensors = torch.tensor([segments_ids, segments_ids]).unsqueeze(0)
         # Load bertForMultipleChoice
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForMultipleChoice', 'bert-base-cased', num_choices=2)
+        model.eval()
         # Predict the multiple choice logits
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 multiple_choice_logits = model(tokens_tensor, segments_tensors)
         # Or get the multiple choice loss
-        >>> labels = torch.tensor([1])
-        >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
+        labels = torch.tensor([1])
+        multiple_choice_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
     """
     model = BertForMultipleChoice.from_pretrained(*args, **kwargs)
     return model
@@ -298,25 +298,25 @@ def bertForQuestionAnswering(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
-        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        >>> tokenized_text = tokenizer.tokenize(text)
-        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        >>> tokens_tensor = torch.tensor([indexed_tokens])
-        >>> segments_tensors = torch.tensor([segments_ids])
+        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+        tokenized_text = tokenizer.tokenize(text)
+        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+        tokens_tensor = torch.tensor([indexed_tokens])
+        segments_tensors = torch.tensor([segments_ids])
         # Load bertForQuestionAnswering
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForQuestionAnswering', 'bert-base-cased')
+        model.eval()
         # Predict the start and end positions logits
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 start_logits, end_logits = model(tokens_tensor, segments_tensors)
         # Or get the total loss which is the sum of the CrossEntropy loss for the start and end token positions
-        >>> start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
+        start_positions, end_positions = torch.tensor([12]), torch.tensor([14])
         # set model.train() before if training this loss
-        >>> multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
+        multiple_choice_loss = model(tokens_tensor, segments_tensors, start_positions=start_positions, end_positions=end_positions)
     """
     model = BertForQuestionAnswering.from_pretrained(*args, **kwargs)
     return model
@@ -337,24 +337,24 @@ def bertForTokenClassification(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'bertTokenizer', 'bert-base-cased', do_basic_tokenize=False)
         #  Prepare tokenized input
-        >>> text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
-        >>> tokenized_text = tokenizer.tokenize(text)
-        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        >>> segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
-        >>> tokens_tensor = torch.tensor([indexed_tokens])
-        >>> segments_tensors = torch.tensor([segments_ids])
+        text = "[CLS] Who was Jim Henson ? [SEP] Jim Henson was a puppeteer [SEP]"
+        tokenized_text = tokenizer.tokenize(text)
+        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+        segments_ids = [0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]
+        tokens_tensor = torch.tensor([indexed_tokens])
+        segments_tensors = torch.tensor([segments_ids])
         # Load bertForTokenClassification
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'bertForTokenClassification', 'bert-base-cased', num_labels=2)
+        model.eval()
         # Predict the token classification logits
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 classif_logits = model(tokens_tensor, segments_tensors)
         # Or get the token classification loss
-        >>> labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
-        >>> classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
+        labels = torch.tensor([[0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0]])
+        classif_loss = model(tokens_tensor, segments_tensors, labels=labels) # set model.train() before if training this loss
     """
     model = BertForTokenClassification.from_pretrained(*args, **kwargs)
     return model
diff --git a/hubconfs/gpt2_hubconf.py b/hubconfs/gpt2_hubconf.py
index dbaa2cd612..18afad3913 100644
--- a/hubconfs/gpt2_hubconf.py
+++ b/hubconfs/gpt2_hubconf.py
@@ -52,11 +52,11 @@ def gpt2Tokenizer(*args, **kwargs):
              Default: None
 
     Example:
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
-        >>> text = "Who was Jim Henson ?"
-        >>> indexed_tokens = tokenizer.encode(tokenized_text)
+        text = "Who was Jim Henson ?"
+        indexed_tokens = tokenizer.encode(tokenized_text)
     """
     tokenizer = GPT2Tokenizer.from_pretrained(*args, **kwargs)
     return tokenizer
@@ -71,24 +71,24 @@ def gpt2Model(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         #  Prepare tokenized input
-        >>> text_1 = "Who was Jim Henson ?"
-        >>> text_2 = "Jim Henson was a puppeteer"
-        >>> indexed_tokens_1 = tokenizer.encode(text_1)
-        >>> indexed_tokens_2 = tokenizer.encode(text_2)
-        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+        text_1 = "Who was Jim Henson ?"
+        text_2 = "Jim Henson was a puppeteer"
+        indexed_tokens_1 = tokenizer.encode(text_1)
+        indexed_tokens_2 = tokenizer.encode(text_2)
+        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load gpt2Model
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Model', 'gpt2')
+        model.eval()
 
         # Predict hidden states features for each layer
         # past can be used to reuse precomputed hidden state in a subsequent predictions
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 hidden_states_1, past = model(tokens_tensor_1)
                 hidden_states_2, past = model(tokens_tensor_2, past=past)
     """
@@ -104,31 +104,31 @@ def gpt2LMHeadModel(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         #  Prepare tokenized input
-        >>> text_1 = "Who was Jim Henson ?"
-        >>> text_2 = "Jim Henson was a puppeteer"
-        >>> indexed_tokens_1 = tokenizer.encode(text_1)
-        >>> indexed_tokens_2 = tokenizer.encode(text_2)
-        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+        text_1 = "Who was Jim Henson ?"
+        text_2 = "Jim Henson was a puppeteer"
+        indexed_tokens_1 = tokenizer.encode(text_1)
+        indexed_tokens_2 = tokenizer.encode(text_2)
+        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load gpt2LMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2LMHeadModel', 'gpt2')
+        model.eval()
 
         # Predict hidden states features for each layer
         # past can be used to reuse precomputed hidden state in a subsequent predictions
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 predictions_1, past = model(tokens_tensor_1)
                 predictions_2, past = model(tokens_tensor_2, past=past)
 
         # Get the predicted last token
-        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        >>> predicted_token = tokenizer.decode([predicted_index])
-        >>> assert predicted_token == ' who'
+        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+        predicted_token = tokenizer.decode([predicted_index])
+        assert predicted_token == ' who'
     """
     model = GPT2LMHeadModel.from_pretrained(*args, **kwargs)
     return model
@@ -143,25 +143,25 @@ def gpt2DoubleHeadsModel(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'gpt2Tokenizer', 'gpt2')
 
         #  Prepare tokenized input
-        >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-        >>> tokenized_text1 = tokenizer.tokenize(text1)
-        >>> tokenized_text2 = tokenizer.tokenize(text2)
-        >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-        >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-        >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-        >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+        tokenized_text1 = tokenizer.tokenize(text1)
+        tokenized_text2 = tokenizer.tokenize(text2)
+        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
         # Load gpt2DoubleHeadsModel
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'gpt2DoubleHeadsModel', 'gpt2')
+        model.eval()
 
         # Predict hidden states features for each layer
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 lm_logits, multiple_choice_logits, presents = model(tokens_tensor, mc_token_ids)
     """
     model = GPT2DoubleHeadsModel.from_pretrained(*args, **kwargs)
diff --git a/hubconfs/gpt_hubconf.py b/hubconfs/gpt_hubconf.py
index c58c1fa708..649075980c 100644
--- a/hubconfs/gpt_hubconf.py
+++ b/hubconfs/gpt_hubconf.py
@@ -76,12 +76,12 @@ def openAIGPTTokenizer(*args, **kwargs):
 			 Default: None
 
     Example:
-		>>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
+		import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 		
-		>>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        >>> tokenized_text = tokenizer.tokenize(text)
-        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+		text = "Who was Jim Henson ? Jim Henson was a puppeteer"
+        tokenized_text = tokenizer.tokenize(text)
+        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
         [763, 509, 4265, 2298, 945, 257, 4265, 2298, 945, 509, 246, 10148, 39041, 483]
     """
     tokenizer = OpenAIGPTTokenizer.from_pretrained(*args, **kwargs)
@@ -97,21 +97,21 @@ def openAIGPTModel(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-		>>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
+		import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 
         #  Prepare tokenized input
-        >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        >>> tokenized_text = tokenizer.tokenize(text)
-        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        >>> tokens_tensor = torch.tensor([indexed_tokens])
+        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
+        tokenized_text = tokenizer.tokenize(text)
+        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+        tokens_tensor = torch.tensor([indexed_tokens])
 
         # Load openAIGPTModel
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTModel', 'openai-gpt')
+        model.eval()
 
         # Predict hidden states features for each layer
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 hidden_states = model(tokens_tensor)
     """
     model = OpenAIGPTModel.from_pretrained(*args, **kwargs)
@@ -126,26 +126,26 @@ def openAIGPTLMHeadModel(*args, **kwargs):
 
 	Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 
         #  Prepare tokenized input
-        >>> text = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        >>> tokenized_text = tokenizer.tokenize(text)
-        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
-        >>> tokens_tensor = torch.tensor([indexed_tokens])
+        text = "Who was Jim Henson ? Jim Henson was a puppeteer"
+        tokenized_text = tokenizer.tokenize(text)
+        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+        tokens_tensor = torch.tensor([indexed_tokens])
 
         # Load openAIGPTLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTLMHeadModel', 'openai-gpt')
+        model.eval()
 
         # Predict hidden states features for each layer
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 predictions = model(tokens_tensor)
 
 		# Get the predicted last token
-		>>> predicted_index = torch.argmax(predictions[0, -1, :]).item()
-		>>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+		predicted_index = torch.argmax(predictions[0, -1, :]).item()
+		predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
         '.</w>'
     """
     model = OpenAIGPTLMHeadModel.from_pretrained(*args, **kwargs)
@@ -161,25 +161,25 @@ def openAIGPTDoubleHeadsModel(*args, **kwargs):
 
 	Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTTokenizer', 'openai-gpt')
 
         #  Prepare tokenized input
-        >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-        >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-        >>> tokenized_text1 = tokenizer.tokenize(text1)
-        >>> tokenized_text2 = tokenizer.tokenize(text2)
-        >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-        >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-        >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-        >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+        text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+        text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+        tokenized_text1 = tokenizer.tokenize(text1)
+        tokenized_text2 = tokenizer.tokenize(text2)
+        indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+        indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+        tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+        mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
         # Load openAIGPTDoubleHeadsModel
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'openAIGPTDoubleHeadsModel', 'openai-gpt')
+        model.eval()
 
         # Predict hidden states features for each layer
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 lm_logits, multiple_choice_logits = model(tokens_tensor, mc_token_ids)
     """
     model = OpenAIGPTDoubleHeadsModel.from_pretrained(*args, **kwargs)
diff --git a/hubconfs/transformer_xl_hubconf.py b/hubconfs/transformer_xl_hubconf.py
index cfcc6aef5a..548d407581 100644
--- a/hubconfs/transformer_xl_hubconf.py
+++ b/hubconfs/transformer_xl_hubconf.py
@@ -45,12 +45,12 @@ def transformerXLTokenizer(*args, **kwargs):
                                        * transfo-xl-wt103
 
     Example:
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
         
-        >>> text = "Who was Jim Henson ?"
-        >>> tokenized_text = tokenizer.tokenize(tokenized_text)
-        >>> indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
+        text = "Who was Jim Henson ?"
+        tokenized_text = tokenizer.tokenize(tokenized_text)
+        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
     """
     tokenizer = TransfoXLTokenizer.from_pretrained(*args, **kwargs)
     return tokenizer
@@ -63,26 +63,26 @@ def transformerXLModel(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
 
         #  Prepare tokenized input
-        >>> text_1 = "Who was Jim Henson ?"
-        >>> text_2 = "Jim Henson was a puppeteer"
-        >>> tokenized_text_1 = tokenizer.tokenize(text_1)
-        >>> tokenized_text_2 = tokenizer.tokenize(text_2)
-        >>> indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
-        >>> indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
-        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+        text_1 = "Who was Jim Henson ?"
+        text_2 = "Jim Henson was a puppeteer"
+        tokenized_text_1 = tokenizer.tokenize(text_1)
+        tokenized_text_2 = tokenizer.tokenize(text_2)
+        indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
+        indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
+        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load transformerXLModel
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLModel', 'transfo-xl-wt103')
+        model.eval()
 
         # Predict hidden states features for each layer
         # We can re-use the memory cells in a subsequent call to attend a longer context
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 hidden_states_1, mems_1 = model(tokens_tensor_1)
                 hidden_states_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
     """
@@ -98,33 +98,33 @@ def transformerXLLMHeadModel(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLTokenizer', 'transfo-xl-wt103')
 
         #  Prepare tokenized input
-        >>> text_1 = "Who was Jim Henson ?"
-        >>> text_2 = "Jim Henson was a puppeteer"
-        >>> tokenized_text_1 = tokenizer.tokenize(text_1)
-        >>> tokenized_text_2 = tokenizer.tokenize(text_2)
-        >>> indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
-        >>> indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
-        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+        text_1 = "Who was Jim Henson ?"
+        text_2 = "Jim Henson was a puppeteer"
+        tokenized_text_1 = tokenizer.tokenize(text_1)
+        tokenized_text_2 = tokenizer.tokenize(text_2)
+        indexed_tokens_1 = tokenizer.convert_tokens_to_ids(tokenized_text_1)
+        indexed_tokens_2 = tokenizer.convert_tokens_to_ids(tokenized_text_2)
+        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load transformerXLLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'transformerXLLMHeadModel', 'transfo-xl-wt103')
+        model.eval()
 
         # Predict hidden states features for each layer
         # We can re-use the memory cells in a subsequent call to attend a longer context
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 predictions_1, mems_1 = model(tokens_tensor_1)
                 predictions_2, mems_2 = model(tokens_tensor_2, mems=mems_1)
 
         # Get the predicted last token
-        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        >>> predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
-        >>> assert predicted_token == 'who'
+        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+        predicted_token = tokenizer.convert_ids_to_tokens([predicted_index])[0]
+        assert predicted_token == 'who'
     """
     model = TransfoXLLMHeadModel.from_pretrained(*args, **kwargs)
     return model
diff --git a/hubconfs/xlm_hubconf.py b/hubconfs/xlm_hubconf.py
index 4f6fd93c24..e96d923944 100644
--- a/hubconfs/xlm_hubconf.py
+++ b/hubconfs/xlm_hubconf.py
@@ -17,16 +17,16 @@ xlm_start_docstring = """
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
 
         #  Prepare tokenized input
-        >>> text_1 = "Who was Jim Henson ?"
-        >>> text_2 = "Jim Henson was a puppeteer"
-        >>> indexed_tokens_1 = tokenizer.encode(text_1)
-        >>> indexed_tokens_2 = tokenizer.encode(text_2)
-        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+        text_1 = "Who was Jim Henson ?"
+        text_2 = "Jim Henson was a puppeteer"
+        indexed_tokens_1 = tokenizer.encode(text_1)
+        indexed_tokens_2 = tokenizer.encode(text_2)
+        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 """
 
 # A lot of models share the same param doc. Use a decorator
@@ -76,11 +76,11 @@ def xlmTokenizer(*args, **kwargs):
              Default: None
 
     Example:
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlmTokenizer', 'xlm-mlm-en-2048')
 
-        >>> text = "Who was Jim Henson ?"
-        >>> indexed_tokens = tokenizer.encode(tokenized_text)
+        text = "Who was Jim Henson ?"
+        indexed_tokens = tokenizer.encode(tokenized_text)
     """
     tokenizer = XLMTokenizer.from_pretrained(*args, **kwargs)
     return tokenizer
@@ -91,11 +91,11 @@ def xlmTokenizer(*args, **kwargs):
 def xlmModel(*args, **kwargs):
     """
         # Load xlmModel
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'xlmModel', 'xlm-mlm-en-2048')
+        model.eval()
 
         # Predict hidden states features for each layer
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 hidden_states_1, mems = model(tokens_tensor_1)
                 hidden_states_2, mems = model(tokens_tensor_2, past=mems)
     """
@@ -108,26 +108,26 @@ def xlmModel(*args, **kwargs):
 def xlmLMHeadModel(*args, **kwargs):
     """
         #  Prepare tokenized input
-        >>> text_1 = "Who was Jim Henson ?"
-        >>> text_2 = "Jim Henson was a puppeteer"
-        >>> indexed_tokens_1 = tokenizer.encode(text_1)
-        >>> indexed_tokens_2 = tokenizer.encode(text_2)
-        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+        text_1 = "Who was Jim Henson ?"
+        text_2 = "Jim Henson was a puppeteer"
+        indexed_tokens_1 = tokenizer.encode(text_1)
+        indexed_tokens_2 = tokenizer.encode(text_2)
+        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load xlnetLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlm-mlm-en-2048')
+        model.eval()
 
         # Predict hidden states features for each layer
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 predictions_1, mems = model(tokens_tensor_1)
                 predictions_2, mems = model(tokens_tensor_2, mems=mems)
 
         # Get the predicted last token
-        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        >>> predicted_token = tokenizer.decode([predicted_index])
-        >>> assert predicted_token == ' who'
+        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+        predicted_token = tokenizer.decode([predicted_index])
+        assert predicted_token == ' who'
     """
     model = XLMWithLMHeadModel.from_pretrained(*args, **kwargs)
     return model
@@ -142,25 +142,25 @@ def xlmLMHeadModel(*args, **kwargs):
 
 #     Example:
 #         # Load the tokenizer
-#         >>> import torch
-#         >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
+#         import torch
+#         tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlm-mlm-en-2048')
 
 #         #  Prepare tokenized input
-#         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-#         >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-#         >>> tokenized_text1 = tokenizer.tokenize(text1)
-#         >>> tokenized_text2 = tokenizer.tokenize(text2)
-#         >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-#         >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-#         >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-#         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+#         text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+#         text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+#         tokenized_text1 = tokenizer.tokenize(text1)
+#         tokenized_text2 = tokenizer.tokenize(text2)
+#         indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+#         indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+#         tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+#         mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
 #         # Load xlnetForSequenceClassification
-#         >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
-#         >>> model.eval()
+#         model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlm-mlm-en-2048')
+#         model.eval()
 
 #         # Predict sequence classes logits
-#         >>> with torch.no_grad():
+#         with torch.no_grad():
 #                 lm_logits, mems = model(tokens_tensor)
 #     """
 #     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
diff --git a/hubconfs/xlnet_hubconf.1.py b/hubconfs/xlnet_hubconf.1.py
index 4c5105a241..fa7b7ddb9f 100644
--- a/hubconfs/xlnet_hubconf.1.py
+++ b/hubconfs/xlnet_hubconf.1.py
@@ -53,11 +53,11 @@ def xlnetTokenizer(*args, **kwargs):
              Default: None
 
     Example:
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
-        >>> text = "Who was Jim Henson ?"
-        >>> indexed_tokens = tokenizer.encode(tokenized_text)
+        text = "Who was Jim Henson ?"
+        indexed_tokens = tokenizer.encode(tokenized_text)
     """
     tokenizer = XLNetTokenizer.from_pretrained(*args, **kwargs)
     return tokenizer
@@ -72,23 +72,23 @@ def xlnetModel(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
         #  Prepare tokenized input
-        >>> text_1 = "Who was Jim Henson ?"
-        >>> text_2 = "Jim Henson was a puppeteer"
-        >>> indexed_tokens_1 = tokenizer.encode(text_1)
-        >>> indexed_tokens_2 = tokenizer.encode(text_2)
-        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+        text_1 = "Who was Jim Henson ?"
+        text_2 = "Jim Henson was a puppeteer"
+        indexed_tokens_1 = tokenizer.encode(text_1)
+        indexed_tokens_2 = tokenizer.encode(text_2)
+        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load xlnetModel
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetModel', 'xlnet-large-cased')
+        model.eval()
 
         # Predict hidden states features for each layer
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 hidden_states_1, mems = model(tokens_tensor_1)
                 hidden_states_2, mems = model(tokens_tensor_2, past=mems)
     """
@@ -106,30 +106,30 @@ def xlnetLMHeadModel(*args, **kwargs):
 
     Example:
         # Load the tokenizer
-        >>> import torch
-        >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+        import torch
+        tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
         #  Prepare tokenized input
-        >>> text_1 = "Who was Jim Henson ?"
-        >>> text_2 = "Jim Henson was a puppeteer"
-        >>> indexed_tokens_1 = tokenizer.encode(text_1)
-        >>> indexed_tokens_2 = tokenizer.encode(text_2)
-        >>> tokens_tensor_1 = torch.tensor([indexed_tokens_1])
-        >>> tokens_tensor_2 = torch.tensor([indexed_tokens_2])
+        text_1 = "Who was Jim Henson ?"
+        text_2 = "Jim Henson was a puppeteer"
+        indexed_tokens_1 = tokenizer.encode(text_1)
+        indexed_tokens_2 = tokenizer.encode(text_2)
+        tokens_tensor_1 = torch.tensor([indexed_tokens_1])
+        tokens_tensor_2 = torch.tensor([indexed_tokens_2])
 
         # Load xlnetLMHeadModel
-        >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
-        >>> model.eval()
+        model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetLMHeadModel', 'xlnet-large-cased')
+        model.eval()
 
         # Predict hidden states features for each layer
-        >>> with torch.no_grad():
+        with torch.no_grad():
                 predictions_1, mems = model(tokens_tensor_1)
                 predictions_2, mems = model(tokens_tensor_2, mems=mems)
 
         # Get the predicted last token
-        >>> predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
-        >>> predicted_token = tokenizer.decode([predicted_index])
-        >>> assert predicted_token == ' who'
+        predicted_index = torch.argmax(predictions_2[0, -1, :]).item()
+        predicted_token = tokenizer.decode([predicted_index])
+        assert predicted_token == ' who'
     """
     model = XLNetLMHeadModel.from_pretrained(*args, **kwargs)
     return model
@@ -144,25 +144,25 @@ def xlnetLMHeadModel(*args, **kwargs):
 
 #     Example:
 #         # Load the tokenizer
-#         >>> import torch
-#         >>> tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
+#         import torch
+#         tokenizer = torch.hub.load('huggingface/pytorch-transformers', 'xlnetTokenizer', 'xlnet-large-cased')
 
 #         #  Prepare tokenized input
-#         >>> text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
-#         >>> text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
-#         >>> tokenized_text1 = tokenizer.tokenize(text1)
-#         >>> tokenized_text2 = tokenizer.tokenize(text2)
-#         >>> indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
-#         >>> indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
-#         >>> tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
-#         >>> mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
+#         text1 = "Who was Jim Henson ? Jim Henson was a puppeteer"
+#         text2 = "Who was Jim Henson ? Jim Henson was a mysterious young man"
+#         tokenized_text1 = tokenizer.tokenize(text1)
+#         tokenized_text2 = tokenizer.tokenize(text2)
+#         indexed_tokens1 = tokenizer.convert_tokens_to_ids(tokenized_text1)
+#         indexed_tokens2 = tokenizer.convert_tokens_to_ids(tokenized_text2)
+#         tokens_tensor = torch.tensor([[indexed_tokens1, indexed_tokens2]])
+#         mc_token_ids = torch.LongTensor([[len(tokenized_text1)-1, len(tokenized_text2)-1]])
 
 #         # Load xlnetForSequenceClassification
-#         >>> model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
-#         >>> model.eval()
+#         model = torch.hub.load('huggingface/pytorch-transformers', 'xlnetForSequenceClassification', 'xlnet-large-cased')
+#         model.eval()
 
 #         # Predict sequence classes logits
-#         >>> with torch.no_grad():
+#         with torch.no_grad():
 #                 lm_logits, mems = model(tokens_tensor)
 #     """
 #     model = XLNetForSequenceClassification.from_pretrained(*args, **kwargs)
diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py
index aa50b1526d..3e28fbd0a9 100644
--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -89,15 +89,15 @@ class AutoConfig(object):
 
         Examples::
 
-            >>> config = AutoConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-            >>> config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
-            >>> config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
-            >>> assert config.output_attention == True
-            >>> config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
-            >>>                                                    foo=False, return_unused_kwargs=True)
-            >>> assert config.output_attention == True
-            >>> assert unused_kwargs == {'foo': False}
+            config = AutoConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            config = AutoConfig.from_pretrained('./test/bert_saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+            config = AutoConfig.from_pretrained('./test/bert_saved_model/my_configuration.json')
+            config = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+            assert config.output_attention == True
+            config, unused_kwargs = AutoConfig.from_pretrained('bert-base-uncased', output_attention=True,
+                                                               foo=False, return_unused_kwargs=True)
+            assert config.output_attention == True
+            assert unused_kwargs == {'foo': False}
 
         """
         if 'bert' in pretrained_model_name_or_path:
@@ -202,13 +202,13 @@ class AutoModel(object):
 
         Examples::
 
-            >>> model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            >>> model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            >>> model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            >>> assert model.config.output_attention == True
-            >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            >>> config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            >>> model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+            model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
         if 'bert' in pretrained_model_name_or_path:
diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index b59445513a..3f2e7cbda1 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -643,12 +643,12 @@ class BertModel(BertPreTrainedModel):
 
     Examples::
 
-        >>> config = BertConfig.from_pretrained('bert-base-uncased')
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertModel(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        config = BertConfig.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = BertModel(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -754,13 +754,13 @@ class BertForPreTraining(BertPreTrainedModel):
 
     Examples::
 
-        >>> config = BertConfig.from_pretrained('bert-base-uncased')
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> 
-        >>> model = BertForPreTraining(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> prediction_scores, seq_relationship_scores = outputs[:2]
+        config = BertConfig.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        
+        model = BertForPreTraining(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        prediction_scores, seq_relationship_scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -824,13 +824,13 @@ class BertForMaskedLM(BertPreTrainedModel):
 
     Examples::
 
-        >>> config = BertConfig.from_pretrained('bert-base-uncased')
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> 
-        >>> model = BertForMaskedLM(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, masked_lm_labels=input_ids)
-        >>> loss, prediction_scores = outputs[:2]
+        config = BertConfig.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        
+        model = BertForMaskedLM(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, masked_lm_labels=input_ids)
+        loss, prediction_scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -891,13 +891,13 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 
     Examples::
 
-        >>> config = BertConfig.from_pretrained('bert-base-uncased')
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> 
-        >>> model = BertForNextSentencePrediction(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> seq_relationship_scores = outputs[0]
+        config = BertConfig.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        
+        model = BertForNextSentencePrediction(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        seq_relationship_scores = outputs[0]
 
     """
     def __init__(self, config):
@@ -951,14 +951,14 @@ class BertForSequenceClassification(BertPreTrainedModel):
 
     Examples::
 
-        >>> config = BertConfig.from_pretrained('bert-base-uncased')
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> 
-        >>> model = BertForSequenceClassification(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=labels)
-        >>> loss, logits = outputs[:2]
+        config = BertConfig.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        
+        model = BertForSequenceClassification(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1057,15 +1057,15 @@ class BertForMultipleChoice(BertPreTrainedModel):
 
     Examples::
 
-        >>> config = BertConfig.from_pretrained('bert-base-uncased')
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> 
-        >>> model = BertForMultipleChoice(config)
-        >>> choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        >>> labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=labels)
-        >>> loss, classification_scores = outputs[:2]
+        config = BertConfig.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        
+        model = BertForMultipleChoice(config)
+        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
+        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, classification_scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1127,14 +1127,14 @@ class BertForTokenClassification(BertPreTrainedModel):
 
     Examples::
 
-        >>> config = BertConfig.from_pretrained('bert-base-uncased')
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> 
-        >>> model = BertForTokenClassification(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=labels)
-        >>> loss, scores = outputs[:2]
+        config = BertConfig.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        
+        model = BertForTokenClassification(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1203,15 +1203,15 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 
     Examples::
 
-        >>> config = BertConfig.from_pretrained('bert-base-uncased')
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> 
-        >>> model = BertForQuestionAnswering(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> start_positions = torch.tensor([1])
-        >>> end_positions = torch.tensor([3])
-        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        >>> loss, start_scores, end_scores = outputs[:2]
+        config = BertConfig.from_pretrained('bert-base-uncased')
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        
+        model = BertForQuestionAnswering(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss, start_scores, end_scores = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index b8a459db7d..4341f0d8a1 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -433,12 +433,12 @@ class GPT2Model(GPT2PreTrainedModel):
 
     Examples::
 
-        >>> config = GPT2Config.from_pretrained('gpt2')
-        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        >>> model = GPT2Model(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        config = GPT2Config.from_pretrained('gpt2')
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        model = GPT2Model(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -567,12 +567,12 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 
     Examples::
 
-        >>> config = GPT2Config.from_pretrained('gpt2')
-        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        >>> model = GPT2LMHeadModel(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=input_ids)
-        >>> loss, logits = outputs[:2]
+        config = GPT2Config.from_pretrained('gpt2')
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        model = GPT2LMHeadModel(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=input_ids)
+        loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -683,14 +683,14 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
 
     Examples::
 
-        >>> config = GPT2Config.from_pretrained('gpt2')
-        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        >>> model = GPT2DoubleHeadsModel(config)
-        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
-        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        >>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, mc_token_ids)
-        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        config = GPT2Config.from_pretrained('gpt2')
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        model = GPT2DoubleHeadsModel(config)
+        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, mc_token_ids)
+        lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 4ea19a965d..a6cb6212ef 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -439,12 +439,12 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
     Examples::
 
-        >>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
-        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        >>> model = OpenAIGPTModel(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        model = OpenAIGPTModel(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -558,12 +558,12 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 
     Examples::
 
-        >>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
-        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        >>> model = OpenAIGPTLMHeadModel(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=input_ids)
-        >>> loss, logits = outputs[:2]
+        config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        model = OpenAIGPTLMHeadModel(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=input_ids)
+        loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -665,14 +665,14 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
 
     Examples::
 
-        >>> config = OpenAIGPTConfig.from_pretrained('openai-gpt')
-        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        >>> model = OpenAIGPTDoubleHeadsModel(config)
-        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
-        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        >>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, mc_token_ids)
-        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        config = OpenAIGPTConfig.from_pretrained('openai-gpt')
+        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        model = OpenAIGPTDoubleHeadsModel(config)
+        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, mc_token_ids)
+        lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 3280c4558d..7c999edda7 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -968,12 +968,12 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
     Examples::
 
-        >>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
-        >>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        >>> model = TransfoXLModel(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states, mems = outputs[:2]
+        config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
+        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        model = TransfoXLModel(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states, mems = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1284,12 +1284,12 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
 
     Examples::
 
-        >>> config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
-        >>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        >>> model = TransfoXLLMHeadModel(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> prediction_scores, mems = outputs[:2]
+        config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
+        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        model = TransfoXLLMHeadModel(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        prediction_scores, mems = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 7ae834f5e5..e458c5ef74 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -105,15 +105,15 @@ class PretrainedConfig(object):
 
         Examples::
 
-            >>> config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
-            >>> config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
-            >>> config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
-            >>> config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
-            >>> assert config.output_attention == True
-            >>> config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True,
-            >>>                                                    foo=False, return_unused_kwargs=True)
-            >>> assert config.output_attention == True
-            >>> assert unused_kwargs == {'foo': False}
+            config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
+            config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
+            config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
+            config = BertConfig.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+            assert config.output_attention == True
+            config, unused_kwargs = BertConfig.from_pretrained('bert-base-uncased', output_attention=True,
+                                                               foo=False, return_unused_kwargs=True)
+            assert config.output_attention == True
+            assert unused_kwargs == {'foo': False}
 
         """
         cache_dir = kwargs.pop('cache_dir', None)
@@ -369,13 +369,13 @@ class PreTrainedModel(nn.Module):
 
         Examples::dictionary
 
-            >>> model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            >>> model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            >>> model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            >>> assert model.config.output_attention == True
-            >>> # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            >>> config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
-            >>> model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+            model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = BertModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = BertConfig.from_json_file('./tf_model/my_tf_model_config.json')
+            model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
         config = kwargs.pop('config', None)
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 3bb864501a..7325ff7875 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -472,12 +472,12 @@ class XLMModel(XLMPreTrainedModel):
 
     Examples::
 
-        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
-        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        >>> model = XLMModel(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        model = XLMModel(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     ATTRIBUTES = ['encoder', 'eos_index', 'pad_index',  # 'with_output', 
@@ -745,12 +745,12 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
 
     Examples::
 
-        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
-        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        >>> model = XLMWithLMHeadModel(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        model = XLMWithLMHeadModel(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -805,14 +805,14 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
 
     Examples::
 
-        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
-        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        >>> 
-        >>> model = XLMForSequenceClassification(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=labels)
-        >>> loss, logits = outputs[:2]
+        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        
+        model = XLMForSequenceClassification(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -885,15 +885,15 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
 
     Examples::
 
-        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
-        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        >>> 
-        >>> model = XLMForQuestionAnswering(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> start_positions = torch.tensor([1])
-        >>> end_positions = torch.tensor([3])
-        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        >>> loss, start_scores, end_scores = outputs[:2]
+        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        
+        model = XLMForQuestionAnswering(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss, start_scores, end_scores = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 515decdb3e..9c1752eb74 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -712,12 +712,12 @@ class XLNetModel(XLNetPreTrainedModel):
 
     Examples::
 
-        >>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
-        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        >>> model = XLNetModel(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        config = XLNetConfig.from_pretrained('xlnet-large-cased')
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = XLNetModel(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -1019,17 +1019,17 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
     Examples::
 
-        >>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
-        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        >>> model = XLNetLMHeadModel(config)
-        >>> # We show how to setup inputs to predict a next token using a bi-directional context.
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0)  # We will predict the masked token
-        >>> perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
-        >>> perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-        >>> target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
-        >>> target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
-        >>> outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
-        >>> next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
+        config = XLNetConfig.from_pretrained('xlnet-large-cased')
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = XLNetLMHeadModel(config)
+        # We show how to setup inputs to predict a next token using a bi-directional context.
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0)  # We will predict the masked token
+        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
+        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
+        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+        outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
+        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
 
     """
     def __init__(self, config):
@@ -1100,14 +1100,14 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
     Examples::
 
-        >>> config = XLNetConfig.from_pretrained('xlnet-large-cased')
-        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        >>> 
-        >>> model = XLNetForSequenceClassification(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=labels)
-        >>> loss, logits = outputs[:2]
+        config = XLNetConfig.from_pretrained('xlnet-large-cased')
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        
+        model = XLNetForSequenceClassification(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1200,15 +1200,15 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
 
     Examples::
 
-        >>> config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
-        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        >>> 
-        >>> model = XLMForQuestionAnswering(config)
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> start_positions = torch.tensor([1])
-        >>> end_positions = torch.tensor([3])
-        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        >>> loss, start_scores, end_scores = outputs[:2]
+        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        
+        model = XLMForQuestionAnswering(config)
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss, start_scores, end_scores = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/tokenization_auto.py b/pytorch_transformers/tokenization_auto.py
index 66d0ce51ba..acbe1cebc6 100644
--- a/pytorch_transformers/tokenization_auto.py
+++ b/pytorch_transformers/tokenization_auto.py
@@ -78,8 +78,8 @@ class AutoTokenizer(object):
 
         Examples::
 
-            >>> config = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
-            >>> config = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+            config = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
+            config = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
 
         """
         if 'bert' in pretrained_model_name_or_path:

From c90119e5430954abc9e852dd334d90d3ca906eb1 Mon Sep 17 00:00:00 2001
From: David Pollack <david@i2x.ai>
Date: Mon, 29 Jul 2019 16:56:02 +0200
Subject: [PATCH 12/35] spelling mistake

---
 pytorch_transformers/convert_pytorch_checkpoint_to_tf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
index c24dddc4d6..025c2f396c 100644
--- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
+++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
@@ -41,7 +41,7 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
         N BertForQuestionAnswering
     """
 
-    tensors_to_transopse = (
+    tensors_to_transpose = (
         "dense.weight",
         "attention.self.query",
         "attention.self.key",
@@ -81,7 +81,7 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
         for var_name in state_dict:
             tf_name = to_tf_var_name(var_name)
             torch_tensor = state_dict[var_name].numpy()
-            if any([x in var_name for x in tensors_to_transopse]):
+            if any([x in var_name for x in tensors_to_transpose]):
                 torch_tensor = torch_tensor.T
             tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
             tf.keras.backend.set_value(tf_var, torch_tensor)

From a24f830604fc150526d9fd4596a4f3900916abe9 Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Sat, 3 Aug 2019 12:17:06 +0800
Subject: [PATCH 13/35] Fix comment typo

---
 pytorch_transformers/modeling_bert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index b59445513a..418939f7da 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -857,7 +857,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         sequence_output = outputs[0]
         prediction_scores = self.cls(sequence_output)
 
-        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention is they are here
+        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))

From 836e51369820797759a71064066a78fb161fe804 Mon Sep 17 00:00:00 2001
From: Saket Khandelwal <saketdecade17@gmail.com>
Date: Sun, 4 Aug 2019 16:05:10 +1000
Subject: [PATCH 14/35] Fixed small typo

---
 pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py   | 2 +-
 pytorch_transformers/convert_openai_checkpoint_to_pytorch.py | 2 +-
 pytorch_transformers/convert_tf_checkpoint_to_pytorch.py     | 2 +-
 pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
index 68cb798a7d..f9e83f5d5b 100755
--- a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
@@ -58,7 +58,7 @@ if __name__ == "__main__":
                         default = None,
                         type = str,
                         required = True,
-                        help = "Path the TensorFlow checkpoint path.")
+                        help = "Path to the TensorFlow checkpoint path.")
     parser.add_argument("--pytorch_dump_folder_path",
                         default = None,
                         type = str,
diff --git a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
index 8ec852a4bd..70895b4002 100755
--- a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
@@ -58,7 +58,7 @@ if __name__ == "__main__":
                         default = None,
                         type = str,
                         required = True,
-                        help = "Path the TensorFlow checkpoint path.")
+                        help = "Path to the TensorFlow checkpoint path.")
     parser.add_argument("--pytorch_dump_folder_path",
                         default = None,
                         type = str,
diff --git a/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
index 9f121e8b79..220204f36e 100755
--- a/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_tf_checkpoint_to_pytorch.py
@@ -47,7 +47,7 @@ if __name__ == "__main__":
                         default = None,
                         type = str,
                         required = True,
-                        help = "Path the TensorFlow checkpoint path.")
+                        help = "Path to the TensorFlow checkpoint path.")
     parser.add_argument("--bert_config_file",
                         default = None,
                         type = str,
diff --git a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
index 834b47484f..038c706960 100755
--- a/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_xlnet_checkpoint_to_pytorch.py
@@ -79,7 +79,7 @@ if __name__ == "__main__":
                         default = None,
                         type = str,
                         required = True,
-                        help = "Path the TensorFlow checkpoint path.")
+                        help = "Path to the TensorFlow checkpoint path.")
     parser.add_argument("--xlnet_config_file",
                         default = None,
                         type = str,

From 009273dbddd0964c378d7131445bfc0ae63bc29c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sun, 4 Aug 2019 12:14:57 +0200
Subject: [PATCH 15/35] big doc update [WIP]

---
 README.md                                     |  1 +
 docs/source/converting_tensorflow_models.rst  | 43 +++++++++++------
 docs/source/index.rst                         |  9 ++++
 docs/source/installation.rst                  | 16 +++----
 docs/source/main_classes/configuration.rst    | 10 ++++
 docs/source/main_classes/model.rst            |  8 ++++
 .../main_classes/optimizer_schedules.rst      | 26 ++++++++++
 docs/source/main_classes/tokenizer.rst        |  8 ++++
 docs/source/migration.md                      |  9 ++--
 docs/source/model_doc/bert.rst                |  6 ---
 docs/source/quickstart.md                     | 47 +++++++++++++++++--
 docs/source/serialization.rst                 |  3 ++
 pytorch_transformers/__init__.py              |  2 +-
 pytorch_transformers/modeling_utils.py        | 38 ++++++++++-----
 pytorch_transformers/tokenization_bert.py     |  2 +-
 pytorch_transformers/tokenization_gpt2.py     |  2 +-
 .../tokenization_transfo_xl.py                |  2 +-
 pytorch_transformers/tokenization_utils.py    | 15 +++---
 pytorch_transformers/tokenization_xlnet.py    |  2 +-
 19 files changed, 189 insertions(+), 60 deletions(-)
 create mode 100644 docs/source/main_classes/configuration.rst
 create mode 100644 docs/source/main_classes/model.rst
 create mode 100644 docs/source/main_classes/optimizer_schedules.rst
 create mode 100644 docs/source/main_classes/tokenizer.rst

diff --git a/README.md b/README.md
index 8e2074f727..c31bbd24b7 100644
--- a/README.md
+++ b/README.md
@@ -119,6 +119,7 @@ tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
 ```
 
 ## Quick tour of the fine-tuning/usage scripts
+
 The library comprises several example scripts with SOTA performances for NLU and NLG tasks:
 
 - `run_glue.py`: an example fine-tuning Bert, XLNet and XLM on nine different GLUE tasks (*sequence-level classification*)
diff --git a/docs/source/converting_tensorflow_models.rst b/docs/source/converting_tensorflow_models.rst
index 36c1e4050f..8441c9b1f7 100644
--- a/docs/source/converting_tensorflow_models.rst
+++ b/docs/source/converting_tensorflow_models.rst
@@ -1,7 +1,7 @@
 Converting Tensorflow Checkpoints
 ================================================
 
-A command-line interface is provided to convert a TensorFlow checkpoint in a PyTorch dump of the ``BertForPreTraining`` class  (for BERT) or NumPy checkpoint in a PyTorch dump of the ``OpenAIGPTModel`` class  (for OpenAI GPT).
+A command-line interface is provided to convert original Bert/GPT/GPT-2/Transformer-XL/XLNet/XLM checkpoints in models than be loaded using the ``from_pretrained`` methods of the library.
 
 BERT
 ^^^^
@@ -41,6 +41,20 @@ Here is an example of the conversion process for a pre-trained OpenAI GPT model,
      $PYTORCH_DUMP_OUTPUT \
      [OPENAI_GPT_CONFIG]
 
+OpenAI GPT-2
+^^^^^^^^^^^^
+
+Here is an example of the conversion process for a pre-trained OpenAI GPT-2 model (see `here <https://github.com/openai/gpt-2>`__\ )
+
+.. code-block:: shell
+
+   export OPENAI_GPT2_CHECKPOINT_PATH=/path/to/gpt2/pretrained/weights
+
+   pytorch_transformers gpt2 \
+     $OPENAI_GPT2_CHECKPOINT_PATH \
+     $PYTORCH_DUMP_OUTPUT \
+     [OPENAI_GPT2_CONFIG]
+
 Transformer-XL
 ^^^^^^^^^^^^^^
 
@@ -55,19 +69,6 @@ Here is an example of the conversion process for a pre-trained Transformer-XL mo
      $PYTORCH_DUMP_OUTPUT \
      [TRANSFO_XL_CONFIG]
 
-GPT-2
-^^^^^
-
-Here is an example of the conversion process for a pre-trained OpenAI's GPT-2 model.
-
-.. code-block:: shell
-
-   export GPT2_DIR=/path/to/gpt2/checkpoint
-
-   pytorch_transformers gpt2 \
-     $GPT2_DIR/model.ckpt \
-     $PYTORCH_DUMP_OUTPUT \
-     [GPT2_CONFIG]
 
 XLNet
 ^^^^^
@@ -84,3 +85,17 @@ Here is an example of the conversion process for a pre-trained XLNet model, fine
      $TRANSFO_XL_CONFIG_PATH \
      $PYTORCH_DUMP_OUTPUT \
      STS-B \
+
+
+XLM
+^^^
+
+Here is an example of the conversion process for a pre-trained XLM model:
+
+.. code-block:: shell
+
+   export XLM_CHECKPOINT_PATH=/path/to/xlm/checkpoint
+
+   pytorch_transformers xlm \
+     $XLM_CHECKPOINT_PATH \
+     $PYTORCH_DUMP_OUTPUT \
diff --git a/docs/source/index.rst b/docs/source/index.rst
index be8cfc2a39..c403a0ad4f 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -21,11 +21,20 @@ The library currently contains PyTorch implementations, pre-trained model weight
     pretrained_models
     examples
     notebooks
+    serialization
     converting_tensorflow_models
     migration
     bertology
     torchscript
 
+.. toctree::
+    :maxdepth: 2
+    :caption: Main classes
+
+    main_classes/configuration
+    main_classes/model
+    main_classes/tokenizer
+    main_classes/optimizer_schedules
 
 .. toctree::
     :maxdepth: 2
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index f8beb9f1c8..9e6269da94 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -1,12 +1,12 @@
 Installation
 ================================================
 
-This repo was tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 0.4.1/1.0.0
+PyTorch-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
 
 With pip
 ^^^^^^^^
 
-PyTorch pretrained bert can be installed with pip as follows:
+PyTorch Transformers can be installed using pip as follows:
 
 .. code-block:: bash
 
@@ -15,7 +15,7 @@ PyTorch pretrained bert can be installed with pip as follows:
 From source
 ^^^^^^^^^^^
 
-Clone the repository and instal locally:
+To install from source, clone the repository and install with:
 
 .. code-block:: bash
 
@@ -27,11 +27,11 @@ Clone the repository and instal locally:
 Tests
 ^^^^^
 
-An extensive test suite is included for the library and the example scripts. Library tests can be found in the `tests folder <https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`_.
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/pytorch-transformers/tree/master/pytorch_transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`_.
 
-These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
+Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
 
-You can run the tests from the root of the cloned repository with the commands:
+Run all the tests from the root of the cloned repository with the commands:
 
 .. code-block:: bash
 
@@ -42,11 +42,11 @@ You can run the tests from the root of the cloned repository with the commands:
 OpenAI GPT original tokenization workflow
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-If you want to reproduce the original tokenization process of the ``OpenAI GPT`` paper, you will need to install ``ftfy`` (limit to version 4.4.3 if you are using Python 2) and ``SpaCy`` :
+If you want to reproduce the original tokenization process of the ``OpenAI GPT`` paper, you will need to install ``ftfy`` (use version 4.4.3 if you are using Python 2) and ``SpaCy`` :
 
 .. code-block:: bash
 
    pip install spacy ftfy==4.4.3
    python -m spacy download en
 
-If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
+If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer defaults to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
diff --git a/docs/source/main_classes/configuration.rst b/docs/source/main_classes/configuration.rst
new file mode 100644
index 0000000000..5e069629b8
--- /dev/null
+++ b/docs/source/main_classes/configuration.rst
@@ -0,0 +1,10 @@
+Configuration
+----------------------------------------------------
+
+We provide a base class, ``PretrainedConfig``, which can load a pretrained instance either from a local file or directory or from a pretrained model configuration provided by the library (downloaded from HuggingFace AWS S3 repository).
+
+``PretrainedConfig``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.PretrainedConfig
+    :members:
diff --git a/docs/source/main_classes/model.rst b/docs/source/main_classes/model.rst
new file mode 100644
index 0000000000..dd4c9d87dd
--- /dev/null
+++ b/docs/source/main_classes/model.rst
@@ -0,0 +1,8 @@
+Models
+----------------------------------------------------
+
+``PreTrainedModel``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.PreTrainedModel
+    :members:
diff --git a/docs/source/main_classes/optimizer_schedules.rst b/docs/source/main_classes/optimizer_schedules.rst
new file mode 100644
index 0000000000..2d91d495a4
--- /dev/null
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -0,0 +1,26 @@
+Optimizer
+----------------------------------------------------
+
+``AdamW``
+~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.AdamW
+    :members:
+
+Schedules
+----------------------------------------------------
+
+.. autoclass:: pytorch_transformers.ConstantLRSchedule
+    :members:
+
+.. autoclass:: pytorch_transformers.WarmupConstantSchedule
+    :members:
+
+.. autoclass:: pytorch_transformers.WarmupCosineSchedule
+    :members:
+
+.. autoclass:: pytorch_transformers.WarmupCosineWithHardRestartsSchedule
+    :members:
+
+.. autoclass:: pytorch_transformers.WarmupLinearSchedule
+    :members:
diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst
new file mode 100644
index 0000000000..cd6b4786bb
--- /dev/null
+++ b/docs/source/main_classes/tokenizer.rst
@@ -0,0 +1,8 @@
+Tokenizer
+----------------------------------------------------
+
+``PreTrainedTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.PreTrainedTokenizer
+    :members:
diff --git a/docs/source/migration.md b/docs/source/migration.md
index fff4807d5c..ba09253472 100644
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -35,10 +35,13 @@ loss, logits, attentions = outputs
 
 ### Serialization
 
-Breaking change: Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method.
-To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
+Breaking change in the `from_pretrained()`method:
 
-Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other seralization method before.
+1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
+
+2. The additional `*inputs` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute first which can break derived model classes build based on the previous `BertForSequenceClassification` examples. More precisely, the positional arguments `*inputs` provided to `from_pretrained()` are directly forwarded the model `__init__()` method while the keyword arguments `**kwargs` (i) which match configuration class attributes are used to update said attributes (ii) which don't match any configuration class attributes are forwarded to the model `__init__()` method.
+
+Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
 
 Here is an example:
 
diff --git a/docs/source/model_doc/bert.rst b/docs/source/model_doc/bert.rst
index 8c786aa24f..cbce74e73b 100644
--- a/docs/source/model_doc/bert.rst
+++ b/docs/source/model_doc/bert.rst
@@ -15,12 +15,6 @@ BERT
     :members:
 
 
-``AdamW``
-~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_transformers.AdamW
-    :members:
-
 ``BertModel``
 ~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md
index 7414ef48c1..814021038a 100644
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -1,17 +1,58 @@
 # Quickstart
 
+## Philosophy
+
+PyTorch-Transformers is an opinionated library built for NLP researchers seeking to use/study/extend large-scale transformers models.
+
+The library was designed with two strong goals in mind:
+
+- be as easy and fast to use as possible:
+
+  - we strongly limited the number of abstractions to learn, in fact there are almost no abstractions, just three standard classes for each model: configuration, models and tokenizer,
+  - each pretrained model configuration, weights and vocabulary can be downloaded, cached and loaded in the related class in a simple way by using a common `from_pretrained()` instantiation method.
+  - this library is NOT a modular toolbox of building blocks for neural nets, to extend/build-upon the library, just use your regular Python/PyTorch modules and inherit from the base classes of the library to reuse functionalities like model loading/saving.
+
+- provide state-of-the-art models with performances as close as possible to the original models:
+
+  - we provide at least one example for each model which reproduces a result provided by the official authors of said model,
+  - the code is usually as close to the original code base as possible which means some PyTorch code may be not as *pytorchic* as it could be as a result of being converted TensorFlow code.
+
+A few other goals:
+
+- expose the models internals as consistently as possible:
+
+  - we give access, using a single API to the full hidden-states and attention weights,
+  - tokenizer and base model's API are standardized to easily switch between models.
+
+- incorporate a subjective selection of promising tools for fine-tuning/investiguating these models:
+
+  - a simple/consistent way to add new tokens to the vocabulary and embeddings for fine-tuning,
+  - simple ways to mask and prune transformer heads.
+
 ## Main concepts
 
+The library is build around three type of classes for each models:
+
+- **model classes** which are PyTorch models (`torch.nn.Modules`) of the 6 models architectures currently provided in the library, e.g. `BertModel`
+- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`
+- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding strings in list of token embeddings indices to be fed to a model, e.g. `BertTokenizer`
+
+All these classes can be instantiated from pretrained instances and saved locally using two methods:
+
+- `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/pytorch-transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
+- `save_pretrained()` let you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
+
+Let's go through a few simple quick-start examples to see how we can instantiate and use these classes.
 
 ## Quick tour: Usage
 
-Here are two quick-start examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
+Here are two examples showcasing a few `Bert` and `GPT2` classes and pre-trained models.
 
-See package reference for examples for each model classe.
+See full API reference for examples for each model classe.
 
 ### BERT example
 
-First let's prepare a tokenized input from a text string using `BertTokenizer`
+Let's start by preparing a tokenized input (a list of token embeddings indices to be fed to Bert) from a text string using `BertTokenizer`
 
 ```python
 import torch
diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst
index be5197135d..c0de1324cf 100644
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -1,3 +1,6 @@
+Serialization
+----------------------------------------------------
+
 ### Loading Google AI or OpenAI pre-trained weights or PyTorch dump
 
 ### `from_pretrained()` method
diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index f875e4ab18..c9b0aeebb7 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -5,7 +5,7 @@ from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
-from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)
+from .tokenization_utils import (PreTrainedTokenizer)
 
 from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index e458c5ef74..f21927e18c 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -55,11 +55,19 @@ else:
 
 class PretrainedConfig(object):
     """ Base class for all configuration classes.
-        Handle a few common parameters and methods for loading/downloading/saving configurations.
+        Handle a few common attributes and methods for loading/downloading/saving configurations.
     """
     pretrained_config_archive_map = {}
 
     def __init__(self, **kwargs):
+        r""" The initialization of :class:`~pytorch_transformers.PretrainedConfig` extracts
+            a few configuration attributes from `**kwargs` which are common to all models:
+                - `finetuning_task`: string, default `None`. Name of the task used to fine-tune the model (used to convert from original checkpoint)
+                - `num_labels`: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens)
+                - `output_attentions`: boolean, default `False`. Should the model returns attentions weights.
+                - `output_hidden_states`: string, default `False`. Should the model returns all hidden-states.
+                - `torchscript`: string, default `False`. Is the model used with Torchscript.
+        """
         self.finetuning_task = kwargs.pop('finetuning_task', None)
         self.num_labels = kwargs.pop('num_labels', 2)
         self.output_attentions = kwargs.pop('output_attentions', False)
@@ -67,7 +75,7 @@ class PretrainedConfig(object):
         self.torchscript = kwargs.pop('torchscript', False)
 
     def save_pretrained(self, save_directory):
-        """ Save a configuration object to a directory, so that it
+        """ Save a configuration object to the directory `save_directory`, so that it
             can be re-loaded using the `from_pretrained(save_directory)` class method.
         """
         assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
@@ -81,30 +89,34 @@ class PretrainedConfig(object):
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
         r""" Instantiate a PretrainedConfig from a pre-trained model configuration.
 
-        Params:
+        Parameters:
             **pretrained_model_name_or_path**: either:
-                - a string with the `shortcut name` of a pre-trained model configuration to load from cache
-                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
-                - a path to a `directory` containing a configuration file saved
-                    using the `save_pretrained(save_directory)` method.
-                - a path or url to a saved configuration `file`.
+
+                - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing a configuration file saved using the `save_pretrained(save_directory)` method, e.g.: ``./my_model_directory/``.
+                - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
+
             **cache_dir**: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
                 configuration should be cached if the standard cache should not be used.
+
             **return_unused_kwargs**: (`optional`) bool:
+
                 - If False, then this function returns just the final configuration object.
-                - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs`
-                is a dictionary consisting of the key/value pairs whose keys are not configuration attributes:
-                ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
+                - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
+
             **kwargs**: (`optional`) dict:
                 Dictionary of key/value pairs with which to update the configuration object after loading.
+
                 - The values in kwargs of any keys which are configuration attributes will be used
-                to override the loaded values.
+                    to override the loaded values.
                 - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
-                by the `return_unused_kwargs` keyword parameter.
+                    by the `return_unused_kwargs` keyword parameter.
 
         Examples::
 
+            # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
+            # derived class: BertConfig
             config = BertConfig.from_pretrained('bert-base-uncased')    # Download configuration from S3 and cache.
             config = BertConfig.from_pretrained('./test/saved_model/')  # E.g. config (or model) was saved using `save_pretrained('./test/saved_model/')`
             config = BertConfig.from_pretrained('./test/saved_model/my_configuration.json')
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index d9cd881dfd..d7aeff7c39 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -22,7 +22,7 @@ import os
 import unicodedata
 from io import open
 
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index 29a9ae7660..0aee856180 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -31,7 +31,7 @@ except ImportError:
     def lru_cache():
         return lambda func: func
 
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
diff --git a/pytorch_transformers/tokenization_transfo_xl.py b/pytorch_transformers/tokenization_transfo_xl.py
index 237f8ea387..992dff80d5 100644
--- a/pytorch_transformers/tokenization_transfo_xl.py
+++ b/pytorch_transformers/tokenization_transfo_xl.py
@@ -30,7 +30,7 @@ import torch
 import numpy as np
 
 from .file_utils import cached_path
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index eaef2fed1e..556f094f6d 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -444,7 +444,7 @@ class PreTrainedTokenizer(object):
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
         text = self.convert_tokens_to_string(filtered_tokens)
         if clean_up_tokenization_spaces:
-            text = clean_up_tokenization(text)
+            text = self.clean_up_tokenization(text)
         return text
 
     @property
@@ -480,10 +480,9 @@ class PreTrainedTokenizer(object):
         all_ids = list(self.convert_tokens_to_ids(t) for t in all_toks)
         return all_ids
 
-
-
-def clean_up_tokenization(out_string):
-    out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
-                    ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
-                    ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
-    return out_string
+    @staticmethod
+    def clean_up_tokenization(out_string):
+        out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
+                        ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
+                        ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")
+        return out_string
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index a4f3fdfde2..919ac97bce 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -23,7 +23,7 @@ from shutil import copyfile
 import unicodedata
 import six
 
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 

From 28ba345eccb8a7af3e044f3dd82c1d661a065d80 Mon Sep 17 00:00:00 2001
From: Ethan Perez <perez@nyu.edu>
Date: Sun, 4 Aug 2019 12:31:46 -0400
Subject: [PATCH 16/35] Fixing unused weight_decay argument

Currently the L2 regularization is hard-coded to "0.01", even though there is a --weight_decay flag implemented (that is unused). I'm making this flag control the weight decay used for fine-tuning in this script.
---
 examples/single_model_scripts/run_openai_gpt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/single_model_scripts/run_openai_gpt.py b/examples/single_model_scripts/run_openai_gpt.py
index af737b953e..479c08782d 100644
--- a/examples/single_model_scripts/run_openai_gpt.py
+++ b/examples/single_model_scripts/run_openai_gpt.py
@@ -205,7 +205,7 @@ def main():
         param_optimizer = list(model.named_parameters())
         no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
         optimizer_grouped_parameters = [
-            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
+            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
             {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
             ]
         optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)

From 00132b7a7a79b7bed6574ad16550e50eb5af3a8f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sun, 4 Aug 2019 22:42:55 +0200
Subject: [PATCH 17/35] updating docs - adding few tests to tokenizers

---
 docs/source/index.rst                         |   1 -
 docs/source/main_classes/configuration.rst    |   2 +-
 docs/source/main_classes/model.rst            |   7 +
 .../main_classes/optimizer_schedules.rst      |  29 ++
 docs/source/main_classes/tokenizer.rst        |   8 +
 docs/source/model_doc/overview.rst            | 285 ------------------
 docs/source/quickstart.md                     |  17 +-
 docs/source/serialization.rst                 | 250 +++++++--------
 pytorch_transformers/modeling_utils.py        | 161 +++++-----
 pytorch_transformers/tokenization_utils.py    | 151 ++++++++--
 10 files changed, 390 insertions(+), 521 deletions(-)
 delete mode 100644 docs/source/model_doc/overview.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index c403a0ad4f..b80fd8437b 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -40,7 +40,6 @@ The library currently contains PyTorch implementations, pre-trained model weight
     :maxdepth: 2
     :caption: Package Reference
 
-    model_doc/overview
     model_doc/bert
     model_doc/gpt
     model_doc/transformerxl
diff --git a/docs/source/main_classes/configuration.rst b/docs/source/main_classes/configuration.rst
index 5e069629b8..5181874c1a 100644
--- a/docs/source/main_classes/configuration.rst
+++ b/docs/source/main_classes/configuration.rst
@@ -1,7 +1,7 @@
 Configuration
 ----------------------------------------------------
 
-We provide a base class, ``PretrainedConfig``, which can load a pretrained instance either from a local file or directory or from a pretrained model configuration provided by the library (downloaded from HuggingFace AWS S3 repository).
+The base class ``PretrainedConfig`` implements the common methods for loading/saving a configuration either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
 
 ``PretrainedConfig``
 ~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/main_classes/model.rst b/docs/source/main_classes/model.rst
index dd4c9d87dd..ba61afadf0 100644
--- a/docs/source/main_classes/model.rst
+++ b/docs/source/main_classes/model.rst
@@ -1,6 +1,13 @@
 Models
 ----------------------------------------------------
 
+The base class ``PreTrainedModel`` implements the common methods for loading/saving a model either from a local file or directory, or from a pretrained model configuration provided by the library (downloaded from HuggingFace's AWS S3 repository).
+
+``PreTrainedModel`` also implements a few methods which are common among all the models to:
+
+- resize the input token embeddings when new tokens are added to the vocabulary
+- prune the attention heads of the model.
+
 ``PreTrainedModel``
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/main_classes/optimizer_schedules.rst b/docs/source/main_classes/optimizer_schedules.rst
index 2d91d495a4..70fefb7c6d 100644
--- a/docs/source/main_classes/optimizer_schedules.rst
+++ b/docs/source/main_classes/optimizer_schedules.rst
@@ -1,6 +1,11 @@
 Optimizer
 ----------------------------------------------------
 
+The ``.optimization`` module provides:
+
+- an optimizer with weight decay fixed that can be used to fine-tuned models, and
+- several schedules in the form of schedule objects that inherit from ``_LRSchedule``:
+
 ``AdamW``
 ~~~~~~~~~~~~~~~~
 
@@ -10,17 +15,41 @@ Optimizer
 Schedules
 ----------------------------------------------------
 
+Learning Rate Schedules
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
 .. autoclass:: pytorch_transformers.ConstantLRSchedule
     :members:
 
+
 .. autoclass:: pytorch_transformers.WarmupConstantSchedule
     :members:
 
+.. image:: /imgs/warmup_constant_schedule.png
+    :target: /imgs/warmup_constant_schedule.png
+    :alt:
+
+
 .. autoclass:: pytorch_transformers.WarmupCosineSchedule
     :members:
 
+.. image:: /imgs/warmup_cosine_schedule.png
+    :target: /imgs/warmup_cosine_schedule.png
+    :alt:
+
+
 .. autoclass:: pytorch_transformers.WarmupCosineWithHardRestartsSchedule
     :members:
 
+.. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
+    :target: /imgs/warmup_cosine_hard_restarts_schedule.png
+    :alt:
+
+
+
 .. autoclass:: pytorch_transformers.WarmupLinearSchedule
     :members:
+
+.. image:: /imgs/warmup_linear_schedule.png
+    :target: /imgs/warmup_linear_schedule.png
+    :alt:
diff --git a/docs/source/main_classes/tokenizer.rst b/docs/source/main_classes/tokenizer.rst
index cd6b4786bb..12ca5522de 100644
--- a/docs/source/main_classes/tokenizer.rst
+++ b/docs/source/main_classes/tokenizer.rst
@@ -1,6 +1,14 @@
 Tokenizer
 ----------------------------------------------------
 
+The base class ``PreTrainedTokenizer`` implements the common methods for loading/saving a tokenizer either from a local file or directory, or from a pretrained tokenizer provided by the library (downloaded from HuggingFace's AWS S3 repository).
+
+``PreTrainedTokenizer`` is the main entry point into tokenizers as it also implements the main methods for using all the tokenizers:
+
+- tokenizing, converting tokens to ids and back and encoding/decoding,
+- adding new tokens to the vocabulary in a way that is independant of the underlying structure (BPE, SentencePiece...),
+- managing special tokens (adding them, assigning them to roles, making sure they are not split during tokenization)
+
 ``PreTrainedTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/source/model_doc/overview.rst b/docs/source/model_doc/overview.rst
deleted file mode 100644
index 4cca4eb846..0000000000
--- a/docs/source/model_doc/overview.rst
+++ /dev/null
@@ -1,285 +0,0 @@
-Overview
-================================================
-
-
-Here is a detailed documentation of the classes in the package and how to use them:
-
-.. list-table::
-   :header-rows: 1
-
-   * - Sub-section
-     - Description
-   * - `Loading pre-trained weights <#loading-google-ai-or-openai-pre-trained-weights-or-pytorch-dump>`__
-     - How to load Google AI/OpenAI's pre-trained weight or a PyTorch saved instance
-   * - `Serialization best-practices <#serialization-best-practices>`__
-     - How to save and reload a fine-tuned model
-   * - `Configurations <#configurations>`__
-     - API of the configuration classes for BERT, GPT, GPT-2 and Transformer-XL
-
-
-TODO Lysandre filled: Removed Models/Tokenizers/Optimizers as no single link can be made.
-
-
-Configurations
-^^^^^^^^^^^^^^
-
-Models (BERT, GPT, GPT-2 and Transformer-XL) are defined and build from configuration classes which contains the
-parameters of the models (number of layers, dimensionalities...) and a few utilities to read and write from JSON
-configuration files. The respective configuration classes are:
-
-
-* ``BertConfig`` for ``BertModel`` and BERT classes instances.
-* ``OpenAIGPTConfig`` for ``OpenAIGPTModel`` and OpenAI GPT classes instances.
-* ``GPT2Config`` for ``GPT2Model`` and OpenAI GPT-2 classes instances.
-* ``TransfoXLConfig`` for ``TransfoXLModel`` and Transformer-XL classes instances.
-
-These configuration classes contains a few utilities to load and save configurations:
-
-
-* ``from_dict(cls, json_object)``\ : A class method to construct a configuration from a Python dictionary of parameters. Returns an instance of the configuration class.
-* ``from_json_file(cls, json_file)``\ : A class method to construct a configuration from a json file of parameters. Returns an instance of the configuration class.
-* ``to_dict()``\ : Serializes an instance to a Python dictionary. Returns a dictionary.
-* ``to_json_string()``\ : Serializes an instance to a JSON string. Returns a string.
-* ``to_json_file(json_file_path)``\ : Save an instance to a json file.
-
-
-Loading Google AI or OpenAI pre-trained weights or PyTorch dump
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-``from_pretrained()`` method
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of ``BertForPreTraining`` saved with ``torch.save()``\ ), the PyTorch model classes and the tokenizer can be instantiated using the ``from_pretrained()`` method:
-
-.. code-block:: python
-
-   model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs)
-
-where
-
-
-* ``BERT_CLASS`` is either a tokenizer to load the vocabulary (\ ``BertTokenizer`` or ``OpenAIGPTTokenizer`` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): ``BertModel``\ , ``BertForMaskedLM``\ , ``BertForNextSentencePrediction``\ , ``BertForPreTraining``\ , ``BertForSequenceClassification``\ , ``BertForTokenClassification``\ , ``BertForMultipleChoice``\ , ``BertForQuestionAnswering``\ , ``OpenAIGPTModel``\ , ``OpenAIGPTLMHeadModel`` or ``OpenAIGPTDoubleHeadsModel``\ , and
-*
-  ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is either:
-
-
-  *
-    the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
-
-
-    * ``bert-base-uncased``: 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-large-uncased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
-    * ``bert-base-cased``: 12-layer, 768-hidden, 12-heads , 110M parameters
-    * ``bert-large-cased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
-    * ``bert-base-multilingual-uncased``: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-multilingual-cased``: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-chinese``: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``bert-base-german-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://deepset.ai/german-bert>`__
-    * ``bert-large-uncased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
-    * ``bert-large-cased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
-    * ``bert-large-uncased-whole-word-masking-finetuned-squad``: The ``bert-large-uncased-whole-word-masking`` model finetuned on SQuAD (using the ``run_bert_squad.py`` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
-    * ``openai-gpt``: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
-    * ``gpt2``: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
-    * ``gpt2-medium``: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
-    * ``transfo-xl-wt103``: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
-
-  *
-    a path or url to a pretrained model archive containing:
-
-
-    * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
-    * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
-
-  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
-
-*
-  ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
-
-* ``from_tf``\ : should we load the weights from a locally saved TensorFlow checkpoint
-* ``state_dict``\ : an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
-* ``*inputs``\ , `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
-
-``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`__ or the original TensorFlow repository.
-
-When using an ``uncased model``\ , make sure to pass ``--do_lower_case`` to the example training scripts (or pass ``do_lower_case=True`` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).
-
-Examples:
-
-.. code-block:: python
-
-   # BERT
-   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
-   model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-
-   # OpenAI GPT
-   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-   model = OpenAIGPTModel.from_pretrained('openai-gpt')
-
-   # Transformer-XL
-   tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-   model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
-
-   # OpenAI GPT-2
-   tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-   model = GPT2Model.from_pretrained('gpt2')
-
-Cache directory
-~~~~~~~~~~~~~~~
-
-``pytorch_pretrained_bert`` save the pretrained weights in a cache directory which is located at (in this order of priority):
-
-
-* ``cache_dir`` optional arguments to the ``from_pretrained()`` method (see above),
-* shell environment variable ``PYTORCH_PRETRAINED_BERT_CACHE``\ ,
-* PyTorch cache home + ``/pytorch_pretrained_bert/``
-  where PyTorch cache home is defined by (in this order):
-
-  * shell environment variable ``ENV_TORCH_HOME``
-  * shell environment variable ``ENV_XDG_CACHE_HOME`` + ``/torch/``\ )
-  * default: ``~/.cache/torch/``
-
-Usually, if you don't set any specific environment variable, ``pytorch_pretrained_bert`` cache will be at ``~/.cache/torch/pytorch_pretrained_bert/``.
-
-You can alsways safely delete ``pytorch_pretrained_bert`` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
-
-Serialization best-practices
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-This section explain how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
-There are three types of files you need to save to be able to reload a fine-tuned model:
-
-
-* the model it-self which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
-* the configuration file of the model which is saved as a JSON file, and
-* the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
-
-The *default filenames* of these files are as follow:
-
-
-* the model weights file: ``pytorch_model.bin``\ ,
-* the configuration file: ``config.json``\ ,
-* the vocabulary file: ``vocab.txt`` for BERT and Transformer-XL, ``vocab.json`` for GPT/GPT-2 (BPE vocabulary),
-* for GPT/GPT-2 (BPE vocabulary) the additional merges file: ``merges.txt``.
-
-**If you save a model using these *default filenames*\ , you can then re-load the model and tokenizer using the ``from_pretrained()`` method.**
-
-Here is the recommended way of saving the model, configuration and vocabulary to an ``output_dir`` directory and reloading the model and tokenizer afterwards:
-
-.. code-block:: python
-
-   from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
-
-   output_dir = "./models/"
-
-   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
-
-   # If we have a distributed model, save only the encapsulated model
-   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
-   model_to_save = model.module if hasattr(model, 'module') else model
-
-   # If we save using the predefined names, we can load using `from_pretrained`
-   output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
-   output_config_file = os.path.join(output_dir, CONFIG_NAME)
-
-   torch.save(model_to_save.state_dict(), output_model_file)
-   model_to_save.config.to_json_file(output_config_file)
-   tokenizer.save_vocabulary(output_dir)
-
-   # Step 2: Re-load the saved model and vocabulary
-
-   # Example for a Bert model
-   model = BertForQuestionAnswering.from_pretrained(output_dir)
-   tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
-   # Example for a GPT model
-   model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
-   tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
-
-Here is another way you can save and reload the model if you want to use specific paths for each type of files:
-
-.. code-block:: python
-
-   output_model_file = "./models/my_own_model_file.bin"
-   output_config_file = "./models/my_own_config_file.bin"
-   output_vocab_file = "./models/my_own_vocab_file.bin"
-
-   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
-
-   # If we have a distributed model, save only the encapsulated model
-   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
-   model_to_save = model.module if hasattr(model, 'module') else model
-
-   torch.save(model_to_save.state_dict(), output_model_file)
-   model_to_save.config.to_json_file(output_config_file)
-   tokenizer.save_vocabulary(output_vocab_file)
-
-   # Step 2: Re-load the saved model and vocabulary
-
-   # We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
-   # Here is how to do it in this situation:
-
-   # Example for a Bert model
-   config = BertConfig.from_json_file(output_config_file)
-   model = BertForQuestionAnswering(config)
-   state_dict = torch.load(output_model_file)
-   model.load_state_dict(state_dict)
-   tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
-
-   # Example for a GPT model
-   config = OpenAIGPTConfig.from_json_file(output_config_file)
-   model = OpenAIGPTDoubleHeadsModel(config)
-   state_dict = torch.load(output_model_file)
-   model.load_state_dict(state_dict)
-   tokenizer = OpenAIGPTTokenizer(output_vocab_file)
-
-Learning Rate Schedules
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-The ``.optimization`` module also provides additional schedules in the form of schedule objects that inherit from ``_LRSchedule``.
-All ``_LRSchedule`` subclasses accept ``warmup`` and ``t_total`` arguments at construction.
-When an ``_LRSchedule`` object is passed into ``AdamW``\ ,
-the ``warmup`` and ``t_total`` arguments on the optimizer are ignored and the ones in the ``_LRSchedule`` object are used.
-An overview of the implemented schedules:
-
-
-* ``ConstantLR``\ : always returns learning rate 1.
-* ``WarmupConstantSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps. \
-    Keeps learning rate equal to 1. after warmup.
-
-  .. image:: /imgs/warmup_constant_schedule.png
-     :target: /imgs/warmup_constant_schedule.png
-     :alt:
-
-
-* ``WarmupLinearSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps. \
-    Linearly decreases learning rate from 1. to 0. over remaining ``1 - warmup`` steps.
-
-  .. image:: /imgs/warmup_linear_schedule.png
-     :target: /imgs/warmup_linear_schedule.png
-     :alt:
-
-
-* ``WarmupCosineSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps. \
-  Decreases learning rate from 1. to 0. over remaining ``1 - warmup`` steps following a cosine curve. \
-  If ``cycles`` (default=0.5) is different from default, learning rate follows cosine function after warmup.
-
-  .. image:: /imgs/warmup_cosine_schedule.png
-     :target: /imgs/warmup_cosine_schedule.png
-     :alt:
-
-
-* ``WarmupCosineWithHardRestartsSchedule`` : Linearly increases learning rate from 0 to 1 over ``warmup`` fraction of training steps.
-  If ``cycles`` (default=1.) is different from default, learning rate follows ``cycles`` times a cosine decaying learning rate (with hard restarts).
-
-  .. image:: /imgs/warmup_cosine_hard_restarts_schedule.png
-     :target: /imgs/warmup_cosine_hard_restarts_schedule.png
-     :alt:
-
-
-* ``WarmupCosineWithWarmupRestartsSchedule`` : All training progress is divided in ``cycles`` (default=1.) parts of equal length.
-  Every part follows a schedule with the first ``warmup`` fraction of the training steps linearly increasing from 0. to 1.,
-  followed by a learning rate decreasing from 1. to 0. following a cosine curve.
-  Note that the total number of all warmup steps over all cycles together is equal to ``warmup`` * ``cycles``
-
-  .. image:: /imgs/warmup_cosine_warm_restarts_schedule.png
-     :target: /imgs/warmup_cosine_warm_restarts_schedule.png
-     :alt:
\ No newline at end of file
diff --git a/docs/source/quickstart.md b/docs/source/quickstart.md
index 814021038a..f037a95a3a 100644
--- a/docs/source/quickstart.md
+++ b/docs/source/quickstart.md
@@ -8,13 +8,13 @@ The library was designed with two strong goals in mind:
 
 - be as easy and fast to use as possible:
 
-  - we strongly limited the number of abstractions to learn, in fact there are almost no abstractions, just three standard classes for each model: configuration, models and tokenizer,
-  - each pretrained model configuration, weights and vocabulary can be downloaded, cached and loaded in the related class in a simple way by using a common `from_pretrained()` instantiation method.
-  - this library is NOT a modular toolbox of building blocks for neural nets, to extend/build-upon the library, just use your regular Python/PyTorch modules and inherit from the base classes of the library to reuse functionalities like model loading/saving.
+  - we strongly limited the number of user-facing abstractions to learn, in fact there are almost no abstractions, just three standard classes required to use each model: configuration, models and tokenizer,
+  - all of these classes can be initialized in a simple and unified way from pretrained instances by using a common `from_pretrained()` instantiation method which will take care of downloading (if needed), caching and loading the related class from a pretrained instance supplied in the library or your own saved instance.
+  - as a consequence, this library is NOT a modular toolbox of building blocks for neural nets. If you want to extend/build-upon the library, just use regular Python/PyTorch modules and inherit from the base classes of the library to reuse functionalities like model loading/saving.
 
 - provide state-of-the-art models with performances as close as possible to the original models:
 
-  - we provide at least one example for each model which reproduces a result provided by the official authors of said model,
+  - we provide at least one example for each architecture which reproduces a result provided by the official authors of said architecture,
   - the code is usually as close to the original code base as possible which means some PyTorch code may be not as *pytorchic* as it could be as a result of being converted TensorFlow code.
 
 A few other goals:
@@ -34,15 +34,18 @@ A few other goals:
 The library is build around three type of classes for each models:
 
 - **model classes** which are PyTorch models (`torch.nn.Modules`) of the 6 models architectures currently provided in the library, e.g. `BertModel`
-- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`
-- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding strings in list of token embeddings indices to be fed to a model, e.g. `BertTokenizer`
+- **configuration classes** which store all the parameters required to build a model, e.g. `BertConfig`. You don't always need to instantiate these your-self, in particular if you are using a pretrained model without any modification, creating the model will automatically take care of instantiating the configuration (which is part of the model)
+- **tokenizer classes** which store the vocabulary for each model and provide methods for encoding/decoding strings in list of token embeddings indices to be fed to a model, e.g. `BertTokenizer`
 
 All these classes can be instantiated from pretrained instances and saved locally using two methods:
 
 - `from_pretrained()` let you instantiate a model/configuration/tokenizer from a pretrained version either provided by the library itself (currently 27 models are provided as listed [here](https://huggingface.co/pytorch-transformers/pretrained_models.html)) or stored locally (or on a server) by the user,
 - `save_pretrained()` let you save a model/configuration/tokenizer locally so that it can be reloaded using `from_pretrained()`.
 
-Let's go through a few simple quick-start examples to see how we can instantiate and use these classes.
+We'll finish this quickstart tour by going through a few simple quick-start examples to see how we can instantiate and use these classes. The rest of the documentation is organized in two parts:
+
+- the **MAIN CLASSES** section details the common functionalities/method/attributes of the three main type of classes (configuration, model, tokenizer) plus some optimization related classes provided as utilities for training,
+- the **PACKAGE REFERENCE** section details all the variants of each class for each model architectures and in particular the input/output that you should expect when calling each of them.
 
 ## Quick tour: Usage
 
diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst
index c0de1324cf..61854f61ea 100644
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -1,174 +1,188 @@
-Serialization
-----------------------------------------------------
+Loading Google AI or OpenAI pre-trained weights or PyTorch dump
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-### Loading Google AI or OpenAI pre-trained weights or PyTorch dump
+``from_pretrained()`` method
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-### `from_pretrained()` method
+To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of ``BertForPreTraining`` saved with ``torch.save()``\ ), the PyTorch model classes and the tokenizer can be instantiated using the ``from_pretrained()`` method:
 
-To load one of Google AI's, OpenAI's pre-trained models or a PyTorch saved model (an instance of `BertForPreTraining` saved with `torch.save()`), the PyTorch model classes and the tokenizer can be instantiated using the `from_pretrained()` method:
+.. code-block:: python
 
-```python
-model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs)
-```
+   model = BERT_CLASS.from_pretrained(PRE_TRAINED_MODEL_NAME_OR_PATH, cache_dir=None, from_tf=False, state_dict=None, *input, **kwargs)
 
 where
 
-- `BERT_CLASS` is either a tokenizer to load the vocabulary (`BertTokenizer` or `OpenAIGPTTokenizer` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): `BertModel`, `BertForMaskedLM`, `BertForNextSentencePrediction`, `BertForPreTraining`, `BertForSequenceClassification`, `BertForTokenClassification`, `BertForMultipleChoice`, `BertForQuestionAnswering`, `OpenAIGPTModel`, `OpenAIGPTLMHeadModel` or `OpenAIGPTDoubleHeadsModel`, and
-- `PRE_TRAINED_MODEL_NAME_OR_PATH` is either:
 
-  - the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
+* ``BERT_CLASS`` is either a tokenizer to load the vocabulary (\ ``BertTokenizer`` or ``OpenAIGPTTokenizer`` classes) or one of the eight BERT or three OpenAI GPT PyTorch model classes (to load the pre-trained weights): ``BertModel``\ , ``BertForMaskedLM``\ , ``BertForNextSentencePrediction``\ , ``BertForPreTraining``\ , ``BertForSequenceClassification``\ , ``BertForTokenClassification``\ , ``BertForMultipleChoice``\ , ``BertForQuestionAnswering``\ , ``OpenAIGPTModel``\ , ``OpenAIGPTLMHeadModel`` or ``OpenAIGPTDoubleHeadsModel``\ , and
+*
+  ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is either:
 
-    - `bert-base-uncased`: 12-layer, 768-hidden, 12-heads, 110M parameters
-    - `bert-large-uncased`: 24-layer, 1024-hidden, 16-heads, 340M parameters
-    - `bert-base-cased`: 12-layer, 768-hidden, 12-heads , 110M parameters
-    - `bert-large-cased`: 24-layer, 1024-hidden, 16-heads, 340M parameters
-    - `bert-base-multilingual-uncased`: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-    - `bert-base-multilingual-cased`: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
-    - `bert-base-chinese`: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
-    - `bert-base-german-cased`: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters [Performance Evaluation](https://deepset.ai/german-bert)
-    - `bert-large-uncased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
-    - `bert-large-cased-whole-word-masking`: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
-    - `bert-large-uncased-whole-word-masking-finetuned-squad`: The `bert-large-uncased-whole-word-masking` model finetuned on SQuAD (using the `run_bert_squad.py` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
-    - `openai-gpt`: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
-    - `gpt2`: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
-    - `gpt2-medium`: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
-    - `transfo-xl-wt103`: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
 
-  - a path or url to a pretrained model archive containing:
+  *
+    the shortcut name of a Google AI's or OpenAI's pre-trained model selected in the list:
 
-    - `bert_config.json` or `openai_gpt_config.json` a configuration file for the model, and
-    - `pytorch_model.bin` a PyTorch dump of a pre-trained instance of `BertForPreTraining`, `OpenAIGPTModel`, `TransfoXLModel`, `GPT2LMHeadModel` (saved with the usual `torch.save()`)
 
-  If `PRE_TRAINED_MODEL_NAME_OR_PATH` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links [here](pytorch_transformers/modeling.py)) and stored in a cache folder to avoid future download (the cache folder can be found at `~/.pytorch_transformers/`).
+    * ``bert-base-uncased``: 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``bert-large-uncased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
+    * ``bert-base-cased``: 12-layer, 768-hidden, 12-heads , 110M parameters
+    * ``bert-large-cased``: 24-layer, 1024-hidden, 16-heads, 340M parameters
+    * ``bert-base-multilingual-uncased``: (Orig, not recommended) 102 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``bert-base-multilingual-cased``: **(New, recommended)** 104 languages, 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``bert-base-chinese``: Chinese Simplified and Traditional, 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``bert-base-german-cased``: Trained on German data only, 12-layer, 768-hidden, 12-heads, 110M parameters `Performance Evaluation <https://deepset.ai/german-bert>`__
+    * ``bert-large-uncased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
+    * ``bert-large-cased-whole-word-masking``: 24-layer, 1024-hidden, 16-heads, 340M parameters - Trained with Whole Word Masking (mask all of the the tokens corresponding to a word at once)
+    * ``bert-large-uncased-whole-word-masking-finetuned-squad``: The ``bert-large-uncased-whole-word-masking`` model finetuned on SQuAD (using the ``run_bert_squad.py`` examples). Results: *exact_match: 86.91579943235573, f1: 93.1532499015869*
+    * ``openai-gpt``: OpenAI GPT English model, 12-layer, 768-hidden, 12-heads, 110M parameters
+    * ``gpt2``: OpenAI GPT-2 English model, 12-layer, 768-hidden, 12-heads, 117M parameters
+    * ``gpt2-medium``: OpenAI GPT-2 English model, 24-layer, 1024-hidden, 16-heads, 345M parameters
+    * ``transfo-xl-wt103``: Transformer-XL English model trained on wikitext-103, 18-layer, 1024-hidden, 16-heads, 257M parameters
 
-- `cache_dir` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example `cache_dir='./pretrained_model_{}'.format(args.local_rank)` (see the section on distributed training for more information).
-- `from_tf`: should we load the weights from a locally saved TensorFlow checkpoint
-- `state_dict`: an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
-- `*inputs`, `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
+  *
+    a path or url to a pretrained model archive containing:
 
-`Uncased` means that the text has been lowercased before WordPiece tokenization, e.g., `John Smith` becomes `john smith`. The Uncased model also strips out any accent markers. `Cased` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the [Multilingual README](https://github.com/google-research/bert/blob/master/multilingual.md) or the original TensorFlow repository.
 
-**When using an `uncased model`, make sure to pass `--do_lower_case` to the example training scripts (or pass `do_lower_case=True` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).**
+    * ``bert_config.json`` or ``openai_gpt_config.json`` a configuration file for the model, and
+    * ``pytorch_model.bin`` a PyTorch dump of a pre-trained instance of ``BertForPreTraining``\ , ``OpenAIGPTModel``\ , ``TransfoXLModel``\ , ``GPT2LMHeadModel`` (saved with the usual ``torch.save()``\ )
+
+  If ``PRE_TRAINED_MODEL_NAME_OR_PATH`` is a shortcut name, the pre-trained weights will be downloaded from AWS S3 (see the links `here <https://github.com/huggingface/pytorch-transformers/blob/master/pytorch_transformers/modeling_bert.py>`__\ ) and stored in a cache folder to avoid future download (the cache folder can be found at ``~/.pytorch_pretrained_bert/``\ ).
+
+*
+  ``cache_dir`` can be an optional path to a specific directory to download and cache the pre-trained model weights. This option is useful in particular when you are using distributed training: to avoid concurrent access to the same weights you can set for example ``cache_dir='./pretrained_model_{}'.format(args.local_rank)`` (see the section on distributed training for more information).
+
+* ``from_tf``\ : should we load the weights from a locally saved TensorFlow checkpoint
+* ``state_dict``\ : an optional state dictionary (collections.OrderedDict object) to use instead of Google pre-trained models
+* ``*inputs``\ , `**kwargs`: additional input for the specific Bert class (ex: num_labels for BertForSequenceClassification)
+
+``Uncased`` means that the text has been lowercased before WordPiece tokenization, e.g., ``John Smith`` becomes ``john smith``. The Uncased model also strips out any accent markers. ``Cased`` means that the true case and accent markers are preserved. Typically, the Uncased model is better unless you know that case information is important for your task (e.g., Named Entity Recognition or Part-of-Speech tagging). For information about the Multilingual and Chinese model, see the `Multilingual README <https://github.com/google-research/bert/blob/master/multilingual.md>`__ or the original TensorFlow repository.
+
+When using an ``uncased model``\ , make sure to pass ``--do_lower_case`` to the example training scripts (or pass ``do_lower_case=True`` to FullTokenizer if you're using your own script and loading the tokenizer your-self.).
 
 Examples:
 
-```python
-# BERT
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
-model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+.. code-block:: python
 
-# OpenAI GPT
-tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-model = OpenAIGPTModel.from_pretrained('openai-gpt')
+   # BERT
+   tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True, do_basic_tokenize=True)
+   model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
 
-# Transformer-XL
-tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
+   # OpenAI GPT
+   tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+   model = OpenAIGPTModel.from_pretrained('openai-gpt')
 
-# OpenAI GPT-2
-tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-model = GPT2Model.from_pretrained('gpt2')
+   # Transformer-XL
+   tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+   model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
 
-```
+   # OpenAI GPT-2
+   tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+   model = GPT2Model.from_pretrained('gpt2')
 
-#### Cache directory
+Cache directory
+~~~~~~~~~~~~~~~
 
-`pytorch_transformers` save the pretrained weights in a cache directory which is located at (in this order of priority):
+``pytorch_pretrained_bert`` save the pretrained weights in a cache directory which is located at (in this order of priority):
 
-- `cache_dir` optional arguments to the `from_pretrained()` method (see above),
-- shell environment variable `PYTORCH_PRETRAINED_BERT_CACHE`,
-- PyTorch cache home + `/pytorch_transformers/`
+
+* ``cache_dir`` optional arguments to the ``from_pretrained()`` method (see above),
+* shell environment variable ``PYTORCH_PRETRAINED_BERT_CACHE``\ ,
+* PyTorch cache home + ``/pytorch_pretrained_bert/``
   where PyTorch cache home is defined by (in this order):
-  - shell environment variable `ENV_TORCH_HOME`
-  - shell environment variable `ENV_XDG_CACHE_HOME` + `/torch/`)
-  - default: `~/.cache/torch/`
 
-Usually, if you don't set any specific environment variable, `pytorch_transformers` cache will be at `~/.cache/torch/pytorch_transformers/`.
+  * shell environment variable ``ENV_TORCH_HOME``
+  * shell environment variable ``ENV_XDG_CACHE_HOME`` + ``/torch/``\ )
+  * default: ``~/.cache/torch/``
 
-You can alsways safely delete `pytorch_transformers` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
+Usually, if you don't set any specific environment variable, ``pytorch_pretrained_bert`` cache will be at ``~/.cache/torch/pytorch_pretrained_bert/``.
 
-### Serialization best-practices
+You can alsways safely delete ``pytorch_pretrained_bert`` cache but the pretrained model weights and vocabulary files wil have to be re-downloaded from our S3.
+
+Serialization best-practices
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 This section explain how you can save and re-load a fine-tuned model (BERT, GPT, GPT-2 and Transformer-XL).
 There are three types of files you need to save to be able to reload a fine-tuned model:
 
-- the model it-self which should be saved following PyTorch serialization [best practices](https://pytorch.org/docs/stable/notes/serialization.html#best-practices),
-- the configuration file of the model which is saved as a JSON file, and
-- the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
+
+* the model it-self which should be saved following PyTorch serialization `best practices <https://pytorch.org/docs/stable/notes/serialization.html#best-practices>`__\ ,
+* the configuration file of the model which is saved as a JSON file, and
+* the vocabulary (and the merges for the BPE-based models GPT and GPT-2).
 
 The *default filenames* of these files are as follow:
 
-- the model weights file: `pytorch_model.bin`,
-- the configuration file: `config.json`,
-- the vocabulary file: `vocab.txt` for BERT and Transformer-XL, `vocab.json` for GPT/GPT-2 (BPE vocabulary),
-- for GPT/GPT-2 (BPE vocabulary) the additional merges file: `merges.txt`.
 
-**If you save a model using these *default filenames*, you can then re-load the model and tokenizer using the `from_pretrained()` method.**
+* the model weights file: ``pytorch_model.bin``\ ,
+* the configuration file: ``config.json``\ ,
+* the vocabulary file: ``vocab.txt`` for BERT and Transformer-XL, ``vocab.json`` for GPT/GPT-2 (BPE vocabulary),
+* for GPT/GPT-2 (BPE vocabulary) the additional merges file: ``merges.txt``.
 
-Here is the recommended way of saving the model, configuration and vocabulary to an `output_dir` directory and reloading the model and tokenizer afterwards:
+**If you save a model using these *default filenames*\ , you can then re-load the model and tokenizer using the ``from_pretrained()`` method.**
 
-```python
-from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
+Here is the recommended way of saving the model, configuration and vocabulary to an ``output_dir`` directory and reloading the model and tokenizer afterwards:
 
-output_dir = "./models/"
+.. code-block:: python
 
-# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
+   from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
 
-# If we have a distributed model, save only the encapsulated model
-# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
-model_to_save = model.module if hasattr(model, 'module') else model
+   output_dir = "./models/"
 
-# If we save using the predefined names, we can load using `from_pretrained`
-output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
-output_config_file = os.path.join(output_dir, CONFIG_NAME)
+   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
 
-torch.save(model_to_save.state_dict(), output_model_file)
-model_to_save.config.to_json_file(output_config_file)
-tokenizer.save_vocabulary(output_dir)
+   # If we have a distributed model, save only the encapsulated model
+   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
+   model_to_save = model.module if hasattr(model, 'module') else model
 
-# Step 2: Re-load the saved model and vocabulary
+   # If we save using the predefined names, we can load using `from_pretrained`
+   output_model_file = os.path.join(output_dir, WEIGHTS_NAME)
+   output_config_file = os.path.join(output_dir, CONFIG_NAME)
 
-# Example for a Bert model
-model = BertForQuestionAnswering.from_pretrained(output_dir)
-tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
-# Example for a GPT model
-model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
-tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
-```
+   torch.save(model_to_save.state_dict(), output_model_file)
+   model_to_save.config.to_json_file(output_config_file)
+   tokenizer.save_vocabulary(output_dir)
+
+   # Step 2: Re-load the saved model and vocabulary
+
+   # Example for a Bert model
+   model = BertForQuestionAnswering.from_pretrained(output_dir)
+   tokenizer = BertTokenizer.from_pretrained(output_dir, do_lower_case=args.do_lower_case)  # Add specific options if needed
+   # Example for a GPT model
+   model = OpenAIGPTDoubleHeadsModel.from_pretrained(output_dir)
+   tokenizer = OpenAIGPTTokenizer.from_pretrained(output_dir)
 
 Here is another way you can save and reload the model if you want to use specific paths for each type of files:
 
-```python
-output_model_file = "./models/my_own_model_file.bin"
-output_config_file = "./models/my_own_config_file.bin"
-output_vocab_file = "./models/my_own_vocab_file.bin"
+.. code-block:: python
 
-# Step 1: Save a model, configuration and vocabulary that you have fine-tuned
+   output_model_file = "./models/my_own_model_file.bin"
+   output_config_file = "./models/my_own_config_file.bin"
+   output_vocab_file = "./models/my_own_vocab_file.bin"
 
-# If we have a distributed model, save only the encapsulated model
-# (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
-model_to_save = model.module if hasattr(model, 'module') else model
+   # Step 1: Save a model, configuration and vocabulary that you have fine-tuned
 
-torch.save(model_to_save.state_dict(), output_model_file)
-model_to_save.config.to_json_file(output_config_file)
-tokenizer.save_vocabulary(output_vocab_file)
+   # If we have a distributed model, save only the encapsulated model
+   # (it was wrapped in PyTorch DistributedDataParallel or DataParallel)
+   model_to_save = model.module if hasattr(model, 'module') else model
 
-# Step 2: Re-load the saved model and vocabulary
+   torch.save(model_to_save.state_dict(), output_model_file)
+   model_to_save.config.to_json_file(output_config_file)
+   tokenizer.save_vocabulary(output_vocab_file)
 
-# We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
-# Here is how to do it in this situation:
+   # Step 2: Re-load the saved model and vocabulary
 
-# Example for a Bert model
-config = BertConfig.from_json_file(output_config_file)
-model = BertForQuestionAnswering(config)
-state_dict = torch.load(output_model_file)
-model.load_state_dict(state_dict)
-tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
+   # We didn't save using the predefined WEIGHTS_NAME, CONFIG_NAME names, we cannot load using `from_pretrained`.
+   # Here is how to do it in this situation:
+
+   # Example for a Bert model
+   config = BertConfig.from_json_file(output_config_file)
+   model = BertForQuestionAnswering(config)
+   state_dict = torch.load(output_model_file)
+   model.load_state_dict(state_dict)
+   tokenizer = BertTokenizer(output_vocab_file, do_lower_case=args.do_lower_case)
+
+   # Example for a GPT model
+   config = OpenAIGPTConfig.from_json_file(output_config_file)
+   model = OpenAIGPTDoubleHeadsModel(config)
+   state_dict = torch.load(output_model_file)
+   model.load_state_dict(state_dict)
+   tokenizer = OpenAIGPTTokenizer(output_vocab_file)
 
-# Example for a GPT model
-config = OpenAIGPTConfig.from_json_file(output_config_file)
-model = OpenAIGPTDoubleHeadsModel(config)
-state_dict = torch.load(output_model_file)
-model.load_state_dict(state_dict)
-tokenizer = OpenAIGPTTokenizer(output_vocab_file)
-```
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index f21927e18c..8970cd56f8 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -54,20 +54,22 @@ else:
 
 
 class PretrainedConfig(object):
-    """ Base class for all configuration classes.
-        Handle a few common attributes and methods for loading/downloading/saving configurations.
+    r""" Base class for all configuration classes.
+        Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
+
+        Class attributes (overridden by derived classes):
+            - ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
+
+        Parameters:
+            ``finetuning_task``: string, default `None`. Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
+            ``num_labels``: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens)
+            ``output_attentions``: boolean, default `False`. Should the model returns attentions weights.
+            ``output_hidden_states``: string, default `False`. Should the model returns all hidden-states.
+            ``torchscript``: string, default `False`. Is the model used with Torchscript.
     """
     pretrained_config_archive_map = {}
 
     def __init__(self, **kwargs):
-        r""" The initialization of :class:`~pytorch_transformers.PretrainedConfig` extracts
-            a few configuration attributes from `**kwargs` which are common to all models:
-                - `finetuning_task`: string, default `None`. Name of the task used to fine-tune the model (used to convert from original checkpoint)
-                - `num_labels`: integer, default `2`. Number of classes to use when the model is a classification model (sequences/tokens)
-                - `output_attentions`: boolean, default `False`. Should the model returns attentions weights.
-                - `output_hidden_states`: string, default `False`. Should the model returns all hidden-states.
-                - `torchscript`: string, default `False`. Is the model used with Torchscript.
-        """
         self.finetuning_task = kwargs.pop('finetuning_task', None)
         self.num_labels = kwargs.pop('num_labels', 2)
         self.output_attentions = kwargs.pop('output_attentions', False)
@@ -76,7 +78,7 @@ class PretrainedConfig(object):
 
     def save_pretrained(self, save_directory):
         """ Save a configuration object to the directory `save_directory`, so that it
-            can be re-loaded using the `from_pretrained(save_directory)` class method.
+            can be re-loaded using the :func:`~pytorch_transformers.PretrainedConfig.from_pretrained` class method.
         """
         assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
 
@@ -87,32 +89,29 @@ class PretrainedConfig(object):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
-        r""" Instantiate a PretrainedConfig from a pre-trained model configuration.
+        r""" Instantiate a :class:`~pytorch_transformers.PretrainedConfig` (or a derived class) from a pre-trained model configuration.
 
         Parameters:
-            **pretrained_model_name_or_path**: either:
+            pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing a configuration file saved using the `save_pretrained(save_directory)` method, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing a configuration file saved using the :func:`~pytorch_transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
 
-            **cache_dir**: (`optional`) string:
+            cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
                 configuration should be cached if the standard cache should not be used.
 
-            **return_unused_kwargs**: (`optional`) bool:
+            kwargs: (`optional`) dict: key/value pairs with which to update the configuration object after loading.
+
+                - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
+
+            return_unused_kwargs: (`optional`) bool:
 
                 - If False, then this function returns just the final configuration object.
                 - If True, then this functions returns a tuple `(config, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not configuration attributes: ie the part of kwargs which has not been used to update `config` and is otherwise ignored.
 
-            **kwargs**: (`optional`) dict:
-                Dictionary of key/value pairs with which to update the configuration object after loading.
-
-                - The values in kwargs of any keys which are configuration attributes will be used
-                    to override the loaded values.
-                - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled
-                    by the `return_unused_kwargs` keyword parameter.
-
         Examples::
 
             # We can't instantiate directly the base class `PretrainedConfig` so let's show the examples on a
@@ -215,14 +214,26 @@ class PretrainedConfig(object):
 
 
 class PreTrainedModel(nn.Module):
-    """ Base class for all models. Handle loading/storing model config and
-        a simple interface for dowloading and loading pretrained models.
+    r""" Base class for all models.
+
+        :class:`~pytorch_transformers.PreTrainedModel` takes care of storing the configuration of the models and handles methods for loading/downloading/saving models
+        as well as a few methods commons to all models to (i) resize the input embeddings and (ii) prune heads in the self-attention heads.
+
+        Class attributes (overridden by derived classes):
+            - ``config_class``: a class derived from :class:`~pytorch_transformers.PretrainedConfig` to use as configuration class for this model architecture.
+            - ``pretrained_model_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained weights as values.
+            - ``load_tf_weights``: a python ``method`` for loading a TensorFlow checkpoint in a PyTorch model, taking as arguments:
+
+                - ``model``: an instance of the relevant subclass of :class:`~pytorch_transformers.PreTrainedModel`,
+                - ``config``: an instance of the relevant subclass of :class:`~pytorch_transformers.PretrainedConfig`,
+                - ``path``: a path (string) to the TensorFlow checkpoint.
+
+            - ``base_model_prefix``: a string indicating the attribute associated to the base model in derived classes of the same architecture adding modules on top of the base model.
     """
-    config_class = PretrainedConfig
+    config_class = None
     pretrained_model_archive_map = {}
     load_tf_weights = lambda model, config, path: None
     base_model_prefix = ""
-    input_embeddings = None
 
     def __init__(self, config, *inputs, **kwargs):
         super(PreTrainedModel, self).__init__()
@@ -280,17 +291,16 @@ class PreTrainedModel(nn.Module):
 
     def resize_token_embeddings(self, new_num_tokens=None):
         """ Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size.
-            Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
+        Take care of tying weights embeddings afterwards if the model class has a `tie_weights()` method.
 
-        Args:
-            new_num_tokens: (`optional`) int
-                New number of tokens in the embedding matrix.
-                Increasing the size will add newly initialized vectors at the end
-                Reducing the size will remove vectors from the end
-                If not provided or None: does nothing and just returns a pointer to the input tokens Embedding Module of the model.
+        Arguments:
+
+            new_num_tokens: (`optional`) int:
+                New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at the end. Reducing the size will remove vectors from the end. 
+                If not provided or None: does nothing and just returns a pointer to the input tokens ``torch.nn.Embeddings`` Module of the model.
 
         Return: ``torch.nn.Embeddings``
-            Pointer to the input tokens Embedding Module of the model
+            Pointer to the input tokens Embeddings Module of the model
         """
         base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
         model_embeds = base_model._resize_token_embeddings(new_num_tokens)
@@ -309,15 +319,17 @@ class PreTrainedModel(nn.Module):
 
     def prune_heads(self, heads_to_prune):
         """ Prunes heads of the base model.
-            Args:
-                heads_to_prune: dict of {layer_num (int): list of heads to prune in this layer (list of int)}
+
+            Arguments:
+
+                heads_to_prune: dict with keys being selected layer indices (`int`) and associated values being the list of heads to prune in said layer (list of `int`).
         """
         base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
         base_model._prune_heads(heads_to_prune)
 
     def save_pretrained(self, save_directory):
-        """ Save a model with its configuration file to a directory, so that it
-            can be re-loaded using the `from_pretrained(save_directory)` class method.
+        """ Save a model and its configuration file to a directory, so that it
+            can be re-loaded using the `:func:`~pytorch_transformers.PreTrainedModel.from_pretrained`` class method.
         """
         assert os.path.isdir(save_directory), "Saving path should be a directory where the model and configuration can be saved"
 
@@ -336,50 +348,45 @@ class PreTrainedModel(nn.Module):
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         r"""Instantiate a pretrained pytorch model from a pre-trained model configuration.
 
-            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-            To train the model, you should first set it back in training mode with `model.train()`
+        The model is set in evaluation mode by default using ``model.eval()`` (Dropout modules are deactivated)
+        To train the model, you should first set it back in training mode with ``model.train()``
 
-        Params:
-            **pretrained_model_name_or_path**: either:
-                - a string with the `shortcut name` of a pre-trained model to load from cache
-                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
-                - a path to a `directory` containing a configuration file saved
-                    using the `save_pretrained(save_directory)` method.
-                - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
-                    In this case, ``from_tf`` should be set to True and a configuration object should be
-                    provided as `config` argument. This loading option is slower than converting the TensorFlow
-                    checkpoint in a PyTorch model using the provided conversion scripts and loading
-                    the PyTorch model afterwards.
-            **model_args**: (`optional`) Sequence:
-                All remaning positional arguments will be passed to the underlying model's __init__ function
-            **config**: an optional configuration for the model to use instead of an automatically loaded configuation.
-                Configuration can be automatically loaded when:
-                - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
-                - the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
-            **state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
-                from saved weights file.
+        Parameters:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaning positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~pytorch_transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuation. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and is reloaded by suppling the save directory.
+                - the model is loaded by suppling a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionnary for the model to use instead of a state dictionary loaded from saved weights file.
                 This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
-                a simpler option.
-            **cache_dir**: (`optional`) string:
+                In this case though, you should check if using :func:`~pytorch_transformers.PreTrainedModel.save_pretrained` and :func:`~pytorch_transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded pre-trained model
                 configuration should be cached if the standard cache should not be used.
-            **output_loading_info**: (`optional`) boolean:
+
+            output_loading_info: (`optional`) boolean:
                 Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-            **kwargs**: (`optional`) dict:
-                Dictionary of key, values to update the configuration object after loading.
-                Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
 
-               - If a configuration is providedictionaryfig`, **kwargs will be directly passed
-                 to the underlying model's __init__ method.
-               - If a configuration is not provided, **kwargs will be first passed to the pretrained
-                 model configuration class loading function (`PretrainedConfig.from_pretrained`).
-                 Each key of **kwargs that corresponds to a configuration attribute
-                 will be used to override said attribute with the supplied **kwargs value.
-                 Remaining keys that do not correspond to any configuration attribute will
-                 be passed to the underlying model's __init__ function.
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
-        Examples::dictionary
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~pytorch_transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
 
             model = BertModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
             model = BertModel.from_pretrained('./test/saved_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 556f094f6d..1852d74021 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -30,14 +30,34 @@ SPECIAL_TOKENS_MAP_FILE = 'special_tokens_map.json'
 ADDED_TOKENS_FILE = 'added_tokens.json'
 
 class PreTrainedTokenizer(object):
-    """ An abstract class to handle dowloading and loading pretrained tokenizers and adding tokens to the vocabulary.
+    """ Base class for all tokenizers.
+    Handle all the shared methods for tokenization and special tokens as well as methods dowloading/caching/loading pretrained tokenizers as well as adding tokens to the vocabulary.
 
-        Derived class can set up a few special tokens to be used in common scripts and internals:
-            bos_token, eos_token, EOP_TOKEN, EOD_TOKEN, unk_token, sep_token, pad_token, cls_token, mask_token
-            additional_special_tokens = []
+    This class also contain the added tokens in a unified way on top of all tokenizers so we don't have to handle the specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
 
-        We defined an added_tokens_encoder to add new tokens to the vocabulary without having to handle the
-            specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
+    Class attributes (overridden by derived classes):
+
+        - ``vocab_files_names``: a python ``dict`` with, as keys, the ``__init__`` keyword name of each vocabulary file required by the model, and as associated values, the filename for saving the associated file (string).
+        - ``pretrained_vocab_files_map``: a python ``dict of dict`` the high-level keys being the ``__init__`` keyword name of each vocabulary file required by the model, the low-level being the `short-cut-names` (string) of the pretrained models with, as associated values, the `url` (string) to the associated pretrained vocabulary file.
+        - ``max_model_input_sizes``: a python ``dict`` with, as keys, the `short-cut-names` (string) of the pretrained models, and as associated values, the maximum length of the sequence inputs of this model, or None if the model has no maximum input size.
+
+    Parameters:
+
+        - ``bos_token``: (`Optional`) string: a beginning of sentence token. Will be associated to ``self.bos_token``
+
+        - ``eos_token``: (`Optional`) string: an end of sentence token. Will be associated to ``self.eos_token``
+
+        - ``unk_token``: (`Optional`) string: an unknown token. Will be associated to ``self.unk_token``
+
+        - ``sep_token``: (`Optional`) string: a separation token (e.g. to separate context and query in an input sequence). Will be associated to ``self.sep_token``
+
+        - ``pad_token``: (`Optional`) string: a padding token. Will be associated to ``self.pad_token``
+
+        - ``cls_token``: (`Optional`) string: a classification token (e.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model). Will be associated to ``self.cls_token``
+
+        - ``mask_token``: (`Optional`) string: a masking token (e.g. when training a model with masked-language modeling). Will be associated to ``self.mask_token``
+
+        - ``additional_special_tokens``: (`Optional`) list: a list of additional special tokens. Adding all special tokens here ensure they won't be split by the tokenization process. Will be associated to ``self.additional_special_tokens``
     """
     vocab_files_names = {}
     pretrained_vocab_files_map = {}
@@ -49,82 +69,98 @@ class PreTrainedTokenizer(object):
 
     @property
     def bos_token(self):
+        """ Beginning of sentence token (string). Log an error if used while not having been set. """
         if self._bos_token is None:
             logger.error("Using bos_token, but it is not set yet.")
         return self._bos_token
 
     @property
     def eos_token(self):
+        """ End of sentence token (string). Log an error if used while not having been set. """
         if self._eos_token is None:
             logger.error("Using eos_token, but it is not set yet.")
         return self._eos_token
 
     @property
     def unk_token(self):
+        """ Unknown token (string). Log an error if used while not having been set. """
         if self._unk_token is None:
             logger.error("Using unk_token, but it is not set yet.")
         return self._unk_token
 
     @property
     def sep_token(self):
+        """ Separation token (string). E.g. separate context and query in an input sequence. Log an error if used while not having been set. """
         if self._sep_token is None:
             logger.error("Using sep_token, but it is not set yet.")
         return self._sep_token
 
     @property
     def pad_token(self):
+        """ Padding token (string). Log an error if used while not having been set. """
         if self._pad_token is None:
             logger.error("Using pad_token, but it is not set yet.")
         return self._pad_token
 
     @property
     def cls_token(self):
+        """ Classification token (string). E.g. to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """
         if self._cls_token is None:
             logger.error("Using cls_token, but it is not set yet.")
         return self._cls_token
 
     @property
     def mask_token(self):
+        """ Mask token (string). E.g. when training a model with masked-language modeling. Log an error if used while not having been set. """
         if self._mask_token is None:
             logger.error("Using mask_token, but it is not set yet.")
         return self._mask_token
 
     @property
     def additional_special_tokens(self):
+        """ All the additional special tokens you may want to use (list of strings). Log an error if used while not having been set. """
         if self._additional_special_tokens is None:
             logger.error("Using additional_special_tokens, but it is not set yet.")
         return self._additional_special_tokens
 
     @bos_token.setter
     def bos_token(self, value):
+        self.add_tokens([value]) 
         self._bos_token = value
 
     @eos_token.setter
     def eos_token(self, value):
+        self.add_tokens([value]) 
         self._eos_token = value
 
     @unk_token.setter
     def unk_token(self, value):
+        self.add_tokens([value]) 
         self._unk_token = value
 
     @sep_token.setter
     def sep_token(self, value):
+        self.add_tokens([value]) 
         self._sep_token = value
 
     @pad_token.setter
     def pad_token(self, value):
+        self.add_tokens([value]) 
         self._pad_token = value
 
     @cls_token.setter
     def cls_token(self, value):
+        self.add_tokens([value]) 
         self._cls_token = value
 
     @mask_token.setter
     def mask_token(self, value):
+        self.add_tokens([value]) 
         self._mask_token = value
 
     @additional_special_tokens.setter
     def additional_special_tokens(self, value):
+        self.add_tokens(value) 
         self._additional_special_tokens = value
 
     def __init__(self, max_len=None, **kwargs):
@@ -148,15 +184,47 @@ class PreTrainedTokenizer(object):
 
     @classmethod
     def from_pretrained(cls, *inputs, **kwargs):
+        r""" Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
+
+        Parameters:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
+
+            inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
+
+            kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
+
+        Examples::
+
+            # We can't instantiate directly the base class `PreTrainedTokenizer` so let's show our examples on a derived class: BertTokenizer
+
+            # Download vocabulary from S3 and cache.
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+
+            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
+            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
+
+            # If the tokenizer uses a single vocabulary file, you can point directly to this file
+            tokenizer = BertTokenizer.from_pretrained('./test/saved_model/my_vocab.txt')
+
+            # You can link tokens to special vocabulary when instantiating
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', unk_token='<unk>')
+            # You should be sure '<unk>' is in the vocabulary when doing that.
+            # Otherwise use tokenizer.add_special_tokens({'unk_token': '<unk>'}) instead)
+            assert tokenizer.unk_token == '<unk>'
+
+        """
         return cls._from_pretrained(*inputs, **kwargs)
 
 
     @classmethod
     def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
-        """
-        Instantiate a PreTrainedTokenizer from pre-trained vocabulary files.
-        Download and cache the vocabulary files if needed.
-        """
         cache_dir = kwargs.pop('cache_dir', None)
 
         s3_models = list(cls.max_model_input_sizes.keys())
@@ -253,8 +321,9 @@ class PreTrainedTokenizer(object):
 
     def save_pretrained(self, save_directory):
         """ Save the tokenizer vocabulary files (with added tokens) and the
-            special-tokens-to-class-attributes-mapping to a directory, so that it
-            can be re-loaded using the `from_pretrained(save_directory)` class method.
+            special-tokens-to-class-attributes-mapping to a directory.
+
+            This method make sure the full tokenizer can then be re-loaded using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
         """
         if not os.path.isdir(save_directory):
             logger.error("Saving directory ({}) should be a directory".format(save_directory))
@@ -279,37 +348,50 @@ class PreTrainedTokenizer(object):
 
 
     def save_vocabulary(self, save_directory):
-        """ Save the tokenizer vocabulary to a directory. This method doesn't save added tokens
+        """ Save the tokenizer vocabulary to a directory. This method does *NOT* save added tokens
             and special token mappings.
-            
-            Please use `save_pretrained()` to save the full Tokenizer state so that it can be
-            reloaded using the `from_pretrained(save_directory)` class method.
+
+            Please use :func:`~pytorch_transformers.PreTrainedTokenizer.save_pretrained` `()` to save the full Tokenizer state if you want to reload it using the :func:`~pytorch_transformers.PreTrainedTokenizer.from_pretrained` class method.
         """
         raise NotImplementedError
 
 
     def vocab_size(self):
+        """ Size of the base vocabulary (without the added tokens) """
         raise NotImplementedError
 
 
     def __len__(self):
+        """ Size of the full vocabulary with the added tokens """
         return self.vocab_size + len(self.added_tokens_encoder)
 
 
     def add_tokens(self, new_tokens):
         """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the
-            vocabulary, they are added to the added_tokens_encoder with indices starting from
-            the last index of the current vocabulary.
+        vocabulary, they are added to it with indices starting from length of the current vocabulary.
+
+            Parameters:
+                new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
 
             Returns:
-                Number of tokens added to the vocabulary which can be used to correspondingly
-                    increase the size of the associated model embedding matrices.
+                Number of tokens added to the vocabulary.
+
+        Examples::
+
+            # Let's see how to increase the vocabulary of Bert model and tokenizer
+            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+            model = BertModel.from_pretrained('bert-base-uncased')
+
+            num_added_toks = tokenizer.add_tokens(['new_tok1', 'my_new-tok2'])
+            print('We have added', num_added_toks, 'tokens')
+            model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
         """
         if not new_tokens:
             return 0
 
         to_add_tokens = []
         for token in new_tokens:
+            assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
             if token != self.unk_token and \
                     self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
                 to_add_tokens.append(token)
@@ -325,23 +407,23 @@ class PreTrainedTokenizer(object):
 
     def add_special_tokens(self, special_tokens_dict):
         """ Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
-            to class attributes. If the special tokens are not in the vocabulary, they are added
-            to it and indexed starting from the last index of the current vocabulary.
+            to class attributes. If special tokens are NOT in the vocabulary, they are added
+            to it (indexed starting from the last index of the current vocabulary).
+
+            Parameters:
+                special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``].
+                
+                    Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
 
-            Returns:
-                Number of tokens added to the vocabulary which can be used to correspondingly
-                    increase the size of the associated model embedding matrices.
         """
         if not special_tokens_dict:
             return 0
 
-        added_special_tokens = self.add_tokens(special_tokens_dict.values())
         for key, value in special_tokens_dict.items():
+            assert key in self.SPECIAL_TOKENS_ATTRIBUTES
             logger.info("Assigning %s to the %s key of the tokenizer", value, key)
             setattr(self, key, value)
 
-        return added_special_tokens
-
 
     def tokenize(self, text, **kwargs):
         """ Converts a string in a sequence of tokens (string), using the tokenizer.
@@ -369,13 +451,13 @@ class PreTrainedTokenizer(object):
             Split in words for word-based vocabulary or sub-words for sub-word-based
             vocabularies (BPE/SentencePieces/WordPieces).
 
-            Don't take care of added tokens.
+            Do NOT take care of added tokens.
         """
         raise NotImplementedError
 
     def convert_tokens_to_ids(self, tokens):
-        """ Converts a single token or a sequence of tokens (str/unicode) in a integer id
-            (resp.) a sequence of ids, using the vocabulary.
+        """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
+            (resp. a sequence of ids), using the vocabulary.
         """
         if isinstance(tokens, str) or (six.PY2 and isinstance(tokens, unicode)):
             return self._convert_token_to_id_with_added_voc(tokens)
@@ -400,7 +482,8 @@ class PreTrainedTokenizer(object):
 
     def encode(self, text):
         """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
-            same as self.convert_tokens_to_ids(self.tokenize(text)).
+        
+        Same doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
         """
         return self.convert_tokens_to_ids(self.tokenize(text))
 
@@ -440,6 +523,8 @@ class PreTrainedTokenizer(object):
     def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
         """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
             with options to remove special tokens and clean up tokenization spaces.
+
+        Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
         """
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
         text = self.convert_tokens_to_string(filtered_tokens)
@@ -482,6 +567,8 @@ class PreTrainedTokenizer(object):
 
     @staticmethod
     def clean_up_tokenization(out_string):
+        """ Clean up a list of simple English tokenization artifacts like spaces before punctuations and abreviated forms.
+        """
         out_string = out_string.replace(' .', '.').replace(' ?', '?').replace(' !', '!').replace(' ,', ','
                         ).replace(" ' ", "'").replace(" n't", "n't").replace(" 'm", "'m").replace(" do not", " don't"
                         ).replace(" 's", "'s").replace(" 've", "'ve").replace(" 're", "'re")

From 84eb69908226ed78eceb4d6c69e83ca54c39cc21 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=B7=E6=89=93=E4=B8=8D=E5=8A=A8=EF=BC=81?=
 <779222056@qq.com>
Date: Mon, 5 Aug 2019 08:57:09 +0800
Subject: [PATCH 18/35] Update modeling_xlnet.py

---
 pytorch_transformers/modeling_xlnet.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 515decdb3e..764eba1768 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -335,7 +335,7 @@ class XLNetConfig(PretrainedConfig):
 
 try:
     from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm
-except ImportError:
+except (ImportError, AttributeError) as e:
     logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
     class XLNetLayerNorm(nn.Module):
         def __init__(self, d_model, eps=1e-12):

From d7fd10568cd7046e3d9bc53343445c225f444954 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=9B=B7=E6=89=93=E4=B8=8D=E5=8A=A8=EF=BC=81?=
 <779222056@qq.com>
Date: Mon, 5 Aug 2019 08:58:19 +0800
Subject: [PATCH 19/35] Update modeling_bert.py

---
 pytorch_transformers/modeling_bert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index b59445513a..3af9670cef 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -222,7 +222,7 @@ class BertConfig(PretrainedConfig):
 
 try:
     from apex.normalization.fused_layer_norm import FusedLayerNorm as BertLayerNorm
-except ImportError:
+except (ImportError, AttributeError) as e:
     logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
     class BertLayerNorm(nn.Module):
         def __init__(self, hidden_size, eps=1e-12):

From 328afb70971c2b9144a06f7e7ed9c0c7704bfe92 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 14:08:56 +0200
Subject: [PATCH 20/35] cleaning up tokenizer tests structure (at last) - last
 remaining ppb refs

---
 README.md                                     |  11 +-
 docs/source/migration.md                      |  11 +-
 docs/source/serialization.rst                 |   2 +-
 docs/source/torchscript.rst                   |   5 +-
 pytorch_transformers/__init__.py              |   2 +-
 .../convert_pytorch_checkpoint_to_tf.py       |   2 +-
 pytorch_transformers/file_utils.py            |  15 +-
 .../tests/tokenization_bert_test.py           |  35 ++--
 .../tests/tokenization_gpt2_test.py           |  55 +++---
 .../tests/tokenization_openai_test.py         |  54 +++---
 .../tests/tokenization_tests_commons.py       | 172 ++++++++++--------
 .../tests/tokenization_transfo_xl_test.py     |  37 ++--
 .../tests/tokenization_xlm_test.py            |  54 +++---
 .../tests/tokenization_xlnet_test.py          |  72 ++++----
 pytorch_transformers/tokenization_bert.py     |   2 +-
 pytorch_transformers/tokenization_utils.py    |  36 +++-
 16 files changed, 332 insertions(+), 233 deletions(-)

diff --git a/README.md b/README.md
index c31bbd24b7..c130675dbd 100644
--- a/README.md
+++ b/README.md
@@ -345,8 +345,13 @@ tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')
 
 ### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
 
-The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer.
-The new optimizer `AdamW` matches PyTorch `Adam` optimizer API.
+The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer which has a few differences:
+
+- it only implements weights decay correction,
+- schedules are now externals (see below),
+- gradient clipping is now also external (see below).
+
+The new optimizer `AdamW` matches PyTorch `Adam` optimizer API and let you use standard PyTorch or apex methods for the schedule and clipping.
 
 The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore.
 
@@ -355,6 +360,7 @@ Here is a conversion examples from `BertAdam` with a linear warmup and decay sch
 ```python
 # Parameters:
 lr = 1e-3
+max_grad_norm = 1.0
 num_total_steps = 1000
 num_warmup_steps = 100
 warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.1
@@ -374,6 +380,7 @@ scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_tot
 for batch in train_data:
     loss = model(batch)
     loss.backward()
+    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
     scheduler.step()
     optimizer.step()
 ```
diff --git a/docs/source/migration.md b/docs/source/migration.md
index ba09253472..9cfcaade13 100644
--- a/docs/source/migration.md
+++ b/docs/source/migration.md
@@ -68,8 +68,13 @@ tokenizer = BertTokenizer.from_pretrained('./my_saved_model_directory/')
 
 ### Optimizers: BertAdam & OpenAIAdam are now AdamW, schedules are standard PyTorch schedules
 
-The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer.
-The new optimizer `AdamW` matches PyTorch `Adam` optimizer API.
+The two optimizers previously included, `BertAdam` and `OpenAIAdam`, have been replaced by a single `AdamW` optimizer which has a few differences:
+
+- it only implements weights decay correction,
+- schedules are now externals (see below),
+- gradient clipping is now also external (see below).
+
+The new optimizer `AdamW` matches PyTorch `Adam` optimizer API and let you use standard PyTorch or apex methods for the schedule and clipping.
 
 The schedules are now standard [PyTorch learning rate schedulers](https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate) and not part of the optimizer anymore.
 
@@ -78,6 +83,7 @@ Here is a conversion examples from `BertAdam` with a linear warmup and decay sch
 ```python
 # Parameters:
 lr = 1e-3
+max_grad_norm = 1.0
 num_total_steps = 1000
 num_warmup_steps = 100
 warmup_proportion = float(num_warmup_steps) / float(num_total_steps)  # 0.1
@@ -97,6 +103,7 @@ scheduler = WarmupLinearSchedule(optimizer, warmup_steps=num_warmup_steps, t_tot
 for batch in train_data:
     loss = model(batch)
     loss.backward()
+    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
     scheduler.step()
     optimizer.step()
 ```
diff --git a/docs/source/serialization.rst b/docs/source/serialization.rst
index 61854f61ea..7117d7ffa6 100644
--- a/docs/source/serialization.rst
+++ b/docs/source/serialization.rst
@@ -122,7 +122,7 @@ Here is the recommended way of saving the model, configuration and vocabulary to
 
 .. code-block:: python
 
-   from pytorch_pretrained_bert import WEIGHTS_NAME, CONFIG_NAME
+   from pytorch_transformers import WEIGHTS_NAME, CONFIG_NAME
 
    output_dir = "./models/"
 
diff --git a/docs/source/torchscript.rst b/docs/source/torchscript.rst
index 1b84559567..3c38177353 100644
--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -74,7 +74,7 @@ according to a ``BertConfig`` class and then saved to disk under the filename ``
 
 .. code-block:: python
 
-    from pytorch_pretrained_bert import BertModel, BertTokenizer, BertConfig
+    from pytorch_transformers import BertModel, BertTokenizer, BertConfig
     import torch
 
     enc = BertTokenizer.from_pretrained("bert-base-uncased")
@@ -105,6 +105,9 @@ according to a ``BertConfig`` class and then saved to disk under the filename ``
     # The model needs to be in evaluation mode
     model.eval()
 
+    # If you are instantiating the model with `from_pretrained` you can also easily set the TorchScript flag
+    model = BertModel.from_pretrained("bert-base-uncased", torchscript=True)
+
     # Creating the trace
     traced_model = torch.jit.trace(model, [tokens_tensor, segments_tensors])
     torch.jit.save(traced_model, "traced_bert.pt")
diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index c9b0aeebb7..72d666448e 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -39,4 +39,4 @@ from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
 from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
                            WarmupCosineWithHardRestartsSchedule, WarmupLinearSchedule)
 
-from .file_utils import (PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
+from .file_utils import (PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE, cached_path)
diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
index b8858ee3dc..d866365fd0 100644
--- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
+++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
@@ -20,7 +20,7 @@ import argparse
 import torch
 import numpy as np
 import tensorflow as tf
-from pytorch_pretrained_bert.modeling import BertModel
+from pytorch_transformers.modeling import BertModel
 
 
 def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:str):
diff --git a/pytorch_transformers/file_utils.py b/pytorch_transformers/file_utils.py
index fd655cec0e..75c075720c 100644
--- a/pytorch_transformers/file_utils.py
+++ b/pytorch_transformers/file_utils.py
@@ -38,10 +38,13 @@ except ImportError:
 try:
     from pathlib import Path
     PYTORCH_PRETRAINED_BERT_CACHE = Path(
-        os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path))
+        os.getenv('PYTORCH_TRANSFORMERS_CACHE', os.getenv('PYTORCH_PRETRAINED_BERT_CACHE', default_cache_path)))
 except (AttributeError, ImportError):
-    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
-                                              default_cache_path)
+    PYTORCH_PRETRAINED_BERT_CACHE = os.getenv('PYTORCH_TRANSFORMERS_CACHE',
+                                              os.getenv('PYTORCH_PRETRAINED_BERT_CACHE',
+                                                        default_cache_path))
+
+PYTORCH_TRANSFORMERS_CACHE = PYTORCH_PRETRAINED_BERT_CACHE  # Kept for backward compatibility
 
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
@@ -70,7 +73,7 @@ def filename_to_url(filename, cache_dir=None):
     Raise ``EnvironmentError`` if `filename` or its stored metadata do not exist.
     """
     if cache_dir is None:
-        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+        cache_dir = PYTORCH_TRANSFORMERS_CACHE
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
 
@@ -98,7 +101,7 @@ def cached_path(url_or_filename, cache_dir=None):
     make sure the file exists and then return the path.
     """
     if cache_dir is None:
-        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+        cache_dir = PYTORCH_TRANSFORMERS_CACHE
     if sys.version_info[0] == 3 and isinstance(url_or_filename, Path):
         url_or_filename = str(url_or_filename)
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
@@ -187,7 +190,7 @@ def get_from_cache(url, cache_dir=None):
     If it's not there, download it. Then return the path to the cached file.
     """
     if cache_dir is None:
-        cache_dir = PYTORCH_PRETRAINED_BERT_CACHE
+        cache_dir = PYTORCH_TRANSFORMERS_CACHE
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
     if sys.version_info[0] == 2 and not isinstance(cache_dir, str):
diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py
index 0b9cfb1b32..5eb39b729d 100644
--- a/pytorch_transformers/tests/tokenization_bert_test.py
+++ b/pytorch_transformers/tests/tokenization_bert_test.py
@@ -24,30 +24,37 @@ from pytorch_transformers.tokenization_bert import (BasicTokenizer,
                                                     _is_control, _is_punctuation,
                                                     _is_whitespace, VOCAB_FILES_NAMES)
 
-from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
+from .tokenization_tests_commons import CommonTestCases
 
-class TokenizationTest(unittest.TestCase):
+class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
+
+    tokenizer_class = BertTokenizer
+
+    def setUp(self):
+        super(BertTokenizationTest, self).setUp()
 
-    def test_full_tokenizer(self):
         vocab_tokens = [
             "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
             "##ing", ",", "low", "lowest",
         ]
-        with TemporaryDirectory() as tmpdirname:
-            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-            with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
-                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
-            input_text = u"UNwant\u00E9d,running"
-            output_text = u"unwanted, running"
+    def get_tokenizer(self):
+        return BertTokenizer.from_pretrained(self.tmpdirname)
 
-            create_and_check_tokenizer_commons(self, input_text, output_text, BertTokenizer, tmpdirname)
+    def get_input_output_texts(self):
+        input_text = u"UNwant\u00E9d,running"
+        output_text = u"unwanted, running"
+        return input_text, output_text
 
-            tokenizer = BertTokenizer(vocab_file)
+    def test_full_tokenizer(self):
+        tokenizer = BertTokenizer(self.vocab_file)
 
-            tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
-            self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
-            self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
+        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
+        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
 
     def test_chinese(self):
         tokenizer = BasicTokenizer()
diff --git a/pytorch_transformers/tests/tokenization_gpt2_test.py b/pytorch_transformers/tests/tokenization_gpt2_test.py
index 8dae72ec99..da7028c27d 100644
--- a/pytorch_transformers/tests/tokenization_gpt2_test.py
+++ b/pytorch_transformers/tests/tokenization_gpt2_test.py
@@ -20,42 +20,49 @@ import json
 
 from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
 
-from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
+from .tokenization_tests_commons import CommonTestCases
 
-class GPT2TokenizationTest(unittest.TestCase):
+class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
 
-    def test_full_tokenizer(self):
-        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+    tokenizer_class = GPT2Tokenizer
+
+    def setUp(self):
+        super(GPT2TokenizationTest, self).setUp()
+
+        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "lo", "low", "er",
                  "low", "lowest", "newer", "wider", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
-        special_tokens_map = {"unk_token": "<unk>"}
+        self.special_tokens_map = {"unk_token": "<unk>"}
 
-        with TemporaryDirectory() as tmpdirname:
-            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
-            with open(vocab_file, "w") as fp:
-                fp.write(json.dumps(vocab_tokens))
-            with open(merges_file, "w") as fp:
-                fp.write("\n".join(merges))
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+        with open(self.vocab_file, "w") as fp:
+            fp.write(json.dumps(vocab_tokens))
+        with open(self.merges_file, "w") as fp:
+            fp.write("\n".join(merges))
 
-            input_text = u"lower newer"
-            output_text = u"lower<unk>newer"
+    def get_tokenizer(self):
+        return GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
 
-            create_and_check_tokenizer_commons(self, input_text, output_text, GPT2Tokenizer, tmpdirname, **special_tokens_map)
+    def get_input_output_texts(self):
+        input_text = u"lower newer"
+        output_text = u"lower<unk>newer"
+        return input_text, output_text
 
-            tokenizer = GPT2Tokenizer(vocab_file, merges_file, **special_tokens_map)
-            text = "lower"
-            bpe_tokens = ["low", "er"]
-            tokens = tokenizer.tokenize(text)
-            self.assertListEqual(tokens, bpe_tokens)
+    def test_full_tokenizer(self):
+        tokenizer = GPT2Tokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
+        text = "lower"
+        bpe_tokens = ["low", "er"]
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, bpe_tokens)
 
-            input_tokens = tokens + [tokenizer.unk_token]
-            input_bpe_tokens = [13, 12, 17]
-            self.assertListEqual(
-                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        input_tokens = tokens + [tokenizer.unk_token]
+        input_bpe_tokens = [13, 12, 17]
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
 
 if __name__ == '__main__':
diff --git a/pytorch_transformers/tests/tokenization_openai_test.py b/pytorch_transformers/tests/tokenization_openai_test.py
index 9b4841a605..bb354f3fb7 100644
--- a/pytorch_transformers/tests/tokenization_openai_test.py
+++ b/pytorch_transformers/tests/tokenization_openai_test.py
@@ -20,13 +20,17 @@ import json
 
 from pytorch_transformers.tokenization_openai import OpenAIGPTTokenizer, VOCAB_FILES_NAMES
 
-from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
+from .tokenization_tests_commons import CommonTestCases
 
 
-class OpenAIGPTTokenizationTest(unittest.TestCase):
+class OpenAIGPTTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
-    def test_full_tokenizer(self):
-        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+    tokenizer_class = OpenAIGPTTokenizer
+
+    def setUp(self):
+        super(OpenAIGPTTokenizationTest, self).setUp()
+
+        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "w</w>", "r</w>", "t</w>",
                  "lo", "low", "er</w>",
@@ -34,30 +38,34 @@ class OpenAIGPTTokenizationTest(unittest.TestCase):
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["#version: 0.2", "l o", "lo w", "e r</w>", ""]
 
-        with TemporaryDirectory() as tmpdirname:
-            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
-            with open(vocab_file, "w") as fp:
-                fp.write(json.dumps(vocab_tokens))
-            with open(merges_file, "w") as fp:
-                fp.write("\n".join(merges))
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+        with open(self.vocab_file, "w") as fp:
+            fp.write(json.dumps(vocab_tokens))
+        with open(self.merges_file, "w") as fp:
+            fp.write("\n".join(merges))
 
-            input_text = u"lower newer"
-            output_text = u"lower newer"
+    def get_tokenizer(self):
+        return OpenAIGPTTokenizer.from_pretrained(self.tmpdirname)
 
-            create_and_check_tokenizer_commons(self, input_text, output_text, OpenAIGPTTokenizer, tmpdirname)
+    def get_input_output_texts(self):
+        input_text = u"lower newer"
+        output_text = u"lower newer"
+        return input_text, output_text
 
-            tokenizer = OpenAIGPTTokenizer(vocab_file, merges_file)
 
-            text = "lower"
-            bpe_tokens = ["low", "er</w>"]
-            tokens = tokenizer.tokenize(text)
-            self.assertListEqual(tokens, bpe_tokens)
+    def test_full_tokenizer(self):
+        tokenizer = OpenAIGPTTokenizer(self.vocab_file, self.merges_file)
 
-            input_tokens = tokens + ["<unk>"]
-            input_bpe_tokens = [14, 15, 20]
-            self.assertListEqual(
-                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        text = "lower"
+        bpe_tokens = ["low", "er</w>"]
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, bpe_tokens)
+
+        input_tokens = tokens + ["<unk>"]
+        input_bpe_tokens = [14, 15, 20]
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
 
 if __name__ == '__main__':
diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index c37770b229..ebcf6f48d8 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -19,6 +19,7 @@ import sys
 from io import open
 import tempfile
 import shutil
+import unittest
 
 if sys.version_info[0] == 2:
     import cPickle as pickle
@@ -36,113 +37,124 @@ else:
     unicode = str
 
 
-def create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
+class CommonTestCases:
 
-    before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
+    class CommonTokenizerTester(unittest.TestCase):
 
-    with TemporaryDirectory() as tmpdirname:
-        tokenizer.save_pretrained(tmpdirname)
-        tokenizer = tokenizer.from_pretrained(tmpdirname)
+        tokenizer_class = None
 
-    after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
-    tester.assertListEqual(before_tokens, after_tokens)
+        def setUp(self):
+            self.tmpdirname = tempfile.mkdtemp()
 
-def create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
-    tester.assertIsNotNone(tokenizer)
+        def tearDown(self):
+            shutil.rmtree(self.tmpdirname)
 
-    text = u"Munich and Berlin are nice cities"
-    subwords = tokenizer.tokenize(text)
+        def get_tokenizer(self):
+            raise NotImplementedError
 
-    with TemporaryDirectory() as tmpdirname:
+        def get_input_output_texts(self):
+            raise NotImplementedError
 
-        filename = os.path.join(tmpdirname, u"tokenizer.bin")
-        pickle.dump(tokenizer, open(filename, "wb"))
+        def test_save_and_load_tokenizer(self):
+            tokenizer = self.get_tokenizer()
 
-        tokenizer_new = pickle.load(open(filename, "rb"))
+            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
 
-    subwords_loaded = tokenizer_new.tokenize(text)
+            with TemporaryDirectory() as tmpdirname:
+                tokenizer.save_pretrained(tmpdirname)
+                tokenizer = tokenizer.from_pretrained(tmpdirname)
 
-    tester.assertListEqual(subwords, subwords_loaded)
+            after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
+            self.assertListEqual(before_tokens, after_tokens)
+
+        def test_pickle_tokenizer(self):
+            tokenizer = self.get_tokenizer()
+            self.assertIsNotNone(tokenizer)
+
+            text = u"Munich and Berlin are nice cities"
+            subwords = tokenizer.tokenize(text)
+
+            with TemporaryDirectory() as tmpdirname:
+
+                filename = os.path.join(tmpdirname, u"tokenizer.bin")
+                pickle.dump(tokenizer, open(filename, "wb"))
+
+                tokenizer_new = pickle.load(open(filename, "rb"))
+
+            subwords_loaded = tokenizer_new.tokenize(text)
+
+            self.assertListEqual(subwords, subwords_loaded)
 
 
-def create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
+        def test_add_tokens_tokenizer(self):
+            tokenizer = self.get_tokenizer()
 
-    vocab_size = tokenizer.vocab_size
-    all_size = len(tokenizer)
+            vocab_size = tokenizer.vocab_size
+            all_size = len(tokenizer)
 
-    tester.assertNotEqual(vocab_size, 0)
-    tester.assertEqual(vocab_size, all_size)
+            self.assertNotEqual(vocab_size, 0)
+            self.assertEqual(vocab_size, all_size)
 
-    new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
-    added_toks = tokenizer.add_tokens(new_toks)
-    vocab_size_2 = tokenizer.vocab_size
-    all_size_2 = len(tokenizer)
+            new_toks = ["aaaaabbbbbb", "cccccccccdddddddd"]
+            added_toks = tokenizer.add_tokens(new_toks)
+            vocab_size_2 = tokenizer.vocab_size
+            all_size_2 = len(tokenizer)
 
-    tester.assertNotEqual(vocab_size_2, 0)
-    tester.assertEqual(vocab_size, vocab_size_2)
-    tester.assertEqual(added_toks, len(new_toks))
-    tester.assertEqual(all_size_2, all_size + len(new_toks))
+            self.assertNotEqual(vocab_size_2, 0)
+            self.assertEqual(vocab_size, vocab_size_2)
+            self.assertEqual(added_toks, len(new_toks))
+            self.assertEqual(all_size_2, all_size + len(new_toks))
 
-    tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
-    tester.assertGreaterEqual(len(tokens), 4)
-    tester.assertGreater(tokens[0], tokenizer.vocab_size - 1)
-    tester.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
+            tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
+            self.assertGreaterEqual(len(tokens), 4)
+            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
+            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
 
-    new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
-                  'pad_token': "<<<<<|||>|>>>>|>"}
-    added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
-    vocab_size_3 = tokenizer.vocab_size
-    all_size_3 = len(tokenizer)
+            new_toks_2 = {'eos_token': ">>>>|||<||<<|<<",
+                        'pad_token': "<<<<<|||>|>>>>|>"}
+            added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
+            vocab_size_3 = tokenizer.vocab_size
+            all_size_3 = len(tokenizer)
 
-    tester.assertNotEqual(vocab_size_3, 0)
-    tester.assertEqual(vocab_size, vocab_size_3)
-    tester.assertEqual(added_toks_2, len(new_toks_2))
-    tester.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
+            self.assertNotEqual(vocab_size_3, 0)
+            self.assertEqual(vocab_size, vocab_size_3)
+            self.assertEqual(added_toks_2, len(new_toks_2))
+            self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
 
-    tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
+            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
 
-    tester.assertGreaterEqual(len(tokens), 6)
-    tester.assertGreater(tokens[0], tokenizer.vocab_size - 1)
-    tester.assertGreater(tokens[0], tokens[1])
-    tester.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
-    tester.assertGreater(tokens[-2], tokens[-3])
-    tester.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
-    tester.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))
+            self.assertGreaterEqual(len(tokens), 6)
+            self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
+            self.assertGreater(tokens[0], tokens[1])
+            self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
+            self.assertGreater(tokens[-2], tokens[-3])
+            self.assertEqual(tokens[0], tokenizer.convert_tokens_to_ids(tokenizer.eos_token))
+            self.assertEqual(tokens[-2], tokenizer.convert_tokens_to_ids(tokenizer.pad_token))
 
 
-def create_and_check_required_methods_tokenizer(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs):
-    tokenizer = tokenizer_class.from_pretrained(*inputs, **kwargs)
+        def test_required_methods_tokenizer(self):
+            tokenizer = self.get_tokenizer()
+            input_text, output_text = self.get_input_output_texts()
 
-    tokens = tokenizer.tokenize(input_text)
-    ids = tokenizer.convert_tokens_to_ids(tokens)
-    ids_2 = tokenizer.encode(input_text)
-    tester.assertListEqual(ids, ids_2)
+            tokens = tokenizer.tokenize(input_text)
+            ids = tokenizer.convert_tokens_to_ids(tokens)
+            ids_2 = tokenizer.encode(input_text)
+            self.assertListEqual(ids, ids_2)
 
-    tokens_2 = tokenizer.convert_ids_to_tokens(ids)
-    text_2 = tokenizer.decode(ids)
+            tokens_2 = tokenizer.convert_ids_to_tokens(ids)
+            text_2 = tokenizer.decode(ids)
 
-    tester.assertEqual(text_2, output_text)
+            self.assertEqual(text_2, output_text)
 
-    tester.assertNotEqual(len(tokens_2), 0)
-    tester.assertIsInstance(text_2, (str, unicode))
+            self.assertNotEqual(len(tokens_2), 0)
+            self.assertIsInstance(text_2, (str, unicode))
 
 
-def create_and_check_pretrained_model_lists(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs):
-    weights_list = list(tokenizer_class.max_model_input_sizes.keys())
-    weights_lists_2 = []
-    for file_id, map_list in tokenizer_class.pretrained_vocab_files_map.items():
-        weights_lists_2.append(list(map_list.keys()))
+        def test_pretrained_model_lists(self):
+            weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())
+            weights_lists_2 = []
+            for file_id, map_list in self.tokenizer_class.pretrained_vocab_files_map.items():
+                weights_lists_2.append(list(map_list.keys()))
 
-    for weights_list_2 in weights_lists_2:
-        tester.assertListEqual(weights_list, weights_list_2)
-
-
-def create_and_check_tokenizer_commons(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs):
-    create_and_check_pretrained_model_lists(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs)
-    create_and_check_required_methods_tokenizer(tester, input_text, output_text, tokenizer_class, *inputs, **kwargs)
-    create_and_check_add_tokens_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
-    create_and_check_save_and_load_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
-    create_and_check_pickle_tokenizer(tester, tokenizer_class, *inputs, **kwargs)
+            for weights_list_2 in weights_lists_2:
+                self.assertListEqual(weights_list, weights_list_2)
diff --git a/pytorch_transformers/tests/tokenization_transfo_xl_test.py b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
index aecfeaef5f..fbd06cf47e 100644
--- a/pytorch_transformers/tests/tokenization_transfo_xl_test.py
+++ b/pytorch_transformers/tests/tokenization_transfo_xl_test.py
@@ -20,32 +20,39 @@ from io import open
 
 from pytorch_transformers.tokenization_transfo_xl import TransfoXLTokenizer, VOCAB_FILES_NAMES
 
-from.tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
+from.tokenization_tests_commons import CommonTestCases
 
-class TransfoXLTokenizationTest(unittest.TestCase):
+class TransfoXLTokenizationTest(CommonTestCases.CommonTokenizerTester):
+
+    tokenizer_class = TransfoXLTokenizer
+
+    def setUp(self):
+        super(TransfoXLTokenizationTest, self).setUp()
 
-    def test_full_tokenizer(self):
         vocab_tokens = [
             "<unk>", "[CLS]", "[SEP]", "want", "unwanted", "wa", "un",
             "running", ",", "low", "l",
         ]
-        with TemporaryDirectory() as tmpdirname:
-            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-            with open(vocab_file, "w", encoding='utf-8') as vocab_writer:
-                vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
 
-            input_text = u"<unk> UNwanted , running"
-            output_text = u"<unk> unwanted, running"
+    def get_tokenizer(self):
+        return TransfoXLTokenizer.from_pretrained(self.tmpdirname, lower_case=True)
 
-            create_and_check_tokenizer_commons(self, input_text, output_text, TransfoXLTokenizer, tmpdirname, lower_case=True)
+    def get_input_output_texts(self):
+        input_text = u"<unk> UNwanted , running"
+        output_text = u"<unk> unwanted, running"
+        return input_text, output_text
 
-            tokenizer = TransfoXLTokenizer(vocab_file=vocab_file, lower_case=True)
+    def test_full_tokenizer(self):
+        tokenizer = TransfoXLTokenizer(vocab_file=self.vocab_file, lower_case=True)
 
-            tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
-            self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
+        tokens = tokenizer.tokenize(u"<unk> UNwanted , running")
+        self.assertListEqual(tokens, ["<unk>", "unwanted", ",", "running"])
 
-            self.assertListEqual(
-                tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens), [0, 4, 8, 7])
 
     def test_full_tokenizer_lower(self):
         tokenizer = TransfoXLTokenizer(lower_case=True)
diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py
index 97e8fa983f..a20e92044f 100644
--- a/pytorch_transformers/tests/tokenization_xlm_test.py
+++ b/pytorch_transformers/tests/tokenization_xlm_test.py
@@ -20,12 +20,16 @@ import json
 
 from pytorch_transformers.tokenization_xlm import XLMTokenizer, VOCAB_FILES_NAMES
 
-from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
+from .tokenization_tests_commons import CommonTestCases
 
-class XLMTokenizationTest(unittest.TestCase):
+class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
-    def test_full_tokenizer(self):
-        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+    tokenizer_class = XLMTokenizer
+
+    def setUp(self):
+        super(XLMTokenizationTest, self).setUp()
+
+        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "w</w>", "r</w>", "t</w>",
                  "lo", "low", "er</w>",
@@ -33,30 +37,34 @@ class XLMTokenizationTest(unittest.TestCase):
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
         merges = ["l o 123", "lo w 1456", "e r</w> 1789", ""]
 
-        with TemporaryDirectory() as tmpdirname:
-            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-            merges_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['merges_file'])
-            with open(vocab_file, "w") as fp:
-                fp.write(json.dumps(vocab_tokens))
-            with open(merges_file, "w") as fp:
-                fp.write("\n".join(merges))
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+        with open(self.vocab_file, "w") as fp:
+            fp.write(json.dumps(vocab_tokens))
+        with open(self.merges_file, "w") as fp:
+            fp.write("\n".join(merges))
 
-            input_text = u"lower newer"
-            output_text = u"lower newer"
+    def get_tokenizer(self):
+        return XLMTokenizer.from_pretrained(self.tmpdirname)
 
-            create_and_check_tokenizer_commons(self, input_text, output_text, XLMTokenizer, tmpdirname)
+    def get_input_output_texts(self):
+        input_text = u"lower newer"
+        output_text = u"lower newer"
+        return input_text, output_text
 
-            tokenizer = XLMTokenizer(vocab_file, merges_file)
+    def test_full_tokenizer(self):
+        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+        tokenizer = XLMTokenizer(self.vocab_file, self.merges_file)
 
-            text = "lower"
-            bpe_tokens = ["low", "er</w>"]
-            tokens = tokenizer.tokenize(text)
-            self.assertListEqual(tokens, bpe_tokens)
+        text = "lower"
+        bpe_tokens = ["low", "er</w>"]
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, bpe_tokens)
 
-            input_tokens = tokens + ["<unk>"]
-            input_bpe_tokens = [14, 15, 20]
-            self.assertListEqual(
-                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        input_tokens = tokens + ["<unk>"]
+        input_bpe_tokens = [14, 15, 20]
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
 
 if __name__ == '__main__':
diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py
index 27c6b984ee..08e9e9cb2d 100644
--- a/pytorch_transformers/tests/tokenization_xlnet_test.py
+++ b/pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -19,48 +19,58 @@ import unittest
 
 from pytorch_transformers.tokenization_xlnet import (XLNetTokenizer, SPIECE_UNDERLINE)
 
-from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
+from .tokenization_tests_commons import CommonTestCases
 
 SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'fixtures/test_sentencepiece.model')
 
-class XLNetTokenizationTest(unittest.TestCase):
+class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
+
+    tokenizer_class = XLNetTokenizer
+
+    def setUp(self):
+        super(XLNetTokenizationTest, self).setUp()
+
+        # We have a SentencePiece fixture for testing
+        tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
+        tokenizer.save_pretrained(self.tmpdirname)
+
+    def get_tokenizer(self):
+        return XLNetTokenizer.from_pretrained(self.tmpdirname)
+
+    def get_input_output_texts(self):
+        input_text = u"This is a test"
+        output_text = u"This is a test"
+        return input_text, output_text
+
 
     def test_full_tokenizer(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, keep_accents=True)
 
-        with TemporaryDirectory() as tmpdirname:
-            tokenizer.save_pretrained(tmpdirname)
+        tokens = tokenizer.tokenize(u'This is a test')
+        self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
 
-            input_text = u"This is a test"
-            output_text = u"This is a test"
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
 
-            create_and_check_tokenizer_commons(self, input_text, output_text, XLNetTokenizer, tmpdirname)
+        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
+        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                    u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
+                                    u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                    SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(
+            ids, [8, 21, 84, 55, 24, 19, 7, 0,
+                602, 347, 347, 347, 3, 12, 66,
+                46, 72, 80, 6, 0, 4])
 
-            tokens = tokenizer.tokenize(u'This is a test')
-            self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
-
-            self.assertListEqual(
-                tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
-
-            tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
-            self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
-                                        u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
-                                        u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
-                                        SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
-            ids = tokenizer.convert_tokens_to_ids(tokens)
-            self.assertListEqual(
-                ids, [8, 21, 84, 55, 24, 19, 7, 0,
-                    602, 347, 347, 347, 3, 12, 66,
-                    46, 72, 80, 6, 0, 4])
-
-            back_tokens = tokenizer.convert_ids_to_tokens(ids)
-            self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
-                                            u'or', u'n', SPIECE_UNDERLINE + u'in',
-                                            SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
-                                            SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
-                                            SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
-                                            u'<unk>', u'.'])
+        back_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                        u'or', u'n', SPIECE_UNDERLINE + u'in',
+                                        SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
+                                        SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                        SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
+                                        u'<unk>', u'.'])
 
     def test_tokenizer_lower(self):
         tokenizer = XLNetTokenizer(SAMPLE_VOCAB, do_lower_case=True)
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index d7aeff7c39..9bf18a97d7 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -86,7 +86,7 @@ def whitespace_tokenize(text):
 class BertTokenizer(PreTrainedTokenizer):
     r"""
     Constructs a BertTokenizer.
-    :class:`~pytorch_pretrained_bert.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
+    :class:`~pytorch_transformers.BertTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
 
     Args:
         vocab_file: Path to a one-wordpiece-per-line vocabulary file
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 1852d74021..a81a5b9235 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -125,42 +125,34 @@ class PreTrainedTokenizer(object):
 
     @bos_token.setter
     def bos_token(self, value):
-        self.add_tokens([value]) 
         self._bos_token = value
 
     @eos_token.setter
     def eos_token(self, value):
-        self.add_tokens([value]) 
         self._eos_token = value
 
     @unk_token.setter
     def unk_token(self, value):
-        self.add_tokens([value]) 
         self._unk_token = value
 
     @sep_token.setter
     def sep_token(self, value):
-        self.add_tokens([value]) 
         self._sep_token = value
 
     @pad_token.setter
     def pad_token(self, value):
-        self.add_tokens([value]) 
         self._pad_token = value
 
     @cls_token.setter
     def cls_token(self, value):
-        self.add_tokens([value]) 
         self._cls_token = value
 
     @mask_token.setter
     def mask_token(self, value):
-        self.add_tokens([value]) 
         self._mask_token = value
 
     @additional_special_tokens.setter
     def additional_special_tokens(self, value):
-        self.add_tokens(value) 
         self._additional_special_tokens = value
 
     def __init__(self, max_len=None, **kwargs):
@@ -179,6 +171,10 @@ class PreTrainedTokenizer(object):
 
         for key, value in kwargs.items():
             if key in self.SPECIAL_TOKENS_ATTRIBUTES:
+                if key == 'additional_special_tokens':
+                    assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
+                else:
+                    assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
                 setattr(self, key, value)
 
 
@@ -415,15 +411,39 @@ class PreTrainedTokenizer(object):
                 
                     Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
 
+            Returns:
+                Number of tokens added to the vocabulary.
+
+        Examples::
+
+            # Let's see how to add a new classification token to GPT-2
+            tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+            model = GPT2Model.from_pretrained('gpt2')
+
+            special_tokens_dict = {'cls_token': '<CLS>'}
+
+            num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
+            print('We have added', num_added_toks, 'tokens')
+            model.resize_token_embeddings(len(tokenizer))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
+
+            assert tokenizer.cls_token == '<CLS>'
         """
         if not special_tokens_dict:
             return 0
 
+        added_tokens = 0
         for key, value in special_tokens_dict.items():
             assert key in self.SPECIAL_TOKENS_ATTRIBUTES
+            if key == 'additional_special_tokens':
+                assert isinstance(value, (list, tuple)) and all(isinstance(t, str) or (six.PY2 and isinstance(t, unicode)) for t in value)
+                added_tokens += self.add_tokens(value)
+            else:
+                assert isinstance(value, str) or (six.PY2 and isinstance(value, unicode))
+                added_tokens += self.add_tokens([value])
             logger.info("Assigning %s to the %s key of the tokenizer", value, key)
             setattr(self, key, value)
 
+        return added_tokens
 
     def tokenize(self, text, **kwargs):
         """ Converts a string in a sequence of tokens (string), using the tokenizer.

From 58830807d1ce9788da1049f2faab4149b3cab683 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 14:38:59 +0200
Subject: [PATCH 21/35] inidicate we only support pytorch 1.0.0+ now

---
 README.md        | 2 +-
 requirements.txt | 2 +-
 setup.py         | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index c130675dbd..7fdef9dcb8 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ These implementations have been tested on several datasets (see the example scri
 
 ## Installation
 
-This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 0.4.1 to 1.1.0
+This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.0.0+
 
 ### With pip
 
diff --git a/requirements.txt b/requirements.txt
index 165fa74af9..76532d18a5 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,5 @@
 # PyTorch
-torch>=0.4.1
+torch>=1.0.0
 # progress bars in model download and training scripts
 tqdm
 # Accessing files from S3 directly.
diff --git a/setup.py b/setup.py
index 514499481a..4c23714980 100644
--- a/setup.py
+++ b/setup.py
@@ -49,7 +49,7 @@ setup(
     url="https://github.com/huggingface/pytorch-transformers",
     packages=find_packages(exclude=["*.tests", "*.tests.*",
                                     "tests.*", "tests"]),
-    install_requires=['torch>=0.4.1',
+    install_requires=['torch>=1.0.0',
                       'numpy',
                       'boto3',
                       'requests',

From b90e29d52cfe94b1995cc5254f700e776b866d2d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 16:06:34 +0200
Subject: [PATCH 22/35] working on automodels

---
 docs/source/model_doc/auto.rst          |  26 +++
 examples/run_squad.py                   |   2 +-
 pytorch_transformers/modeling_auto.py   | 249 ++++++++++++++++++++++++
 pytorch_transformers/modeling_gpt2.py   |   2 +-
 pytorch_transformers/modeling_openai.py |   2 +-
 pytorch_transformers/modeling_utils.py  |  22 +--
 6 files changed, 289 insertions(+), 14 deletions(-)
 create mode 100644 docs/source/model_doc/auto.rst

diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst
new file mode 100644
index 0000000000..ad439fff03
--- /dev/null
+++ b/docs/source/model_doc/auto.rst
@@ -0,0 +1,26 @@
+AutoModel, AutoConfig and AutoTokenizer - Standard derived classes
+---------------------------------------------------------------
+
+In many case, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method.
+
+Auto classes are here to do this job for you so that you automatically retreive the relevant model given the name/path to the pretrained weights/config/vocabulary.
+
+``AutoConfig``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.AutoConfig
+    :members:
+
+
+``AutoModel``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.AutoModel
+    :members:
+
+
+``AutoTokenizer``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.AutoTokenizer
+    :members:
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 36e03fb012..937aa154ea 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -134,7 +134,7 @@ def train(args, train_dataset, model, tokenizer):
                       'end_positions':   batch[4]}
             if args.model_type in ['xlnet', 'xlm']:
                 inputs.update({'cls_index': batch[5],
-                               'p_mask':    batch[6]})
+                               'p_mask':       batch[6]})
             outputs = model(**inputs)
             loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
 
diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py
index 3e28fbd0a9..7d3ea7ec60 100644
--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -18,6 +18,11 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 
+import torch
+import torch.nn as nn
+from torch.nn import CrossEntropyLoss, MSELoss
+from torch.nn.parameter import Parameter
+
 from .modeling_bert import BertConfig, BertModel
 from .modeling_openai import OpenAIGPTConfig, OpenAIGPTModel
 from .modeling_gpt2 import GPT2Config, GPT2Model
@@ -25,6 +30,8 @@ from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel
 from .modeling_xlnet import XLNetConfig, XLNetModel
 from .modeling_xlm import XLMConfig, XLMModel
 
+from .modeling_utils import PreTrainedModel, SequenceSummary
+
 logger = logging.getLogger(__name__)
 
 class AutoConfig(object):
@@ -228,3 +235,245 @@ class AutoModel(object):
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                          "'xlm'".format(pretrained_model_name_or_path))
 
+
+class DerivedAutoModel(PreTrainedModel):
+    r"""
+        :class:`~pytorch_transformers.DerivedAutoModel` is a base class for building
+        standardized derived models on top of :class:`~pytorch_transformers.AutoModel` by adding heads
+
+        The `from_pretrained()` method take care of using the correct base model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `bert`: BertConfig (Bert model)
+            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
+            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
+            - contains `xlnet`: XLNetConfig (XLNet model)
+            - contains `xlm`: XLMConfig (XLM model)
+
+        This class should usually not be instantiated using `__init__()` but `from_pretrained()`.
+    """
+    config_class = None
+    pretrained_model_archive_map = {}
+    load_tf_weights = lambda model, config, path: None
+    base_model_prefix = "transformer"
+
+    def __init__(self, base_model):
+        super(DerivedAutoModel, self).__init__(base_model.config)
+        self.transformer = base_model
+
+    def init_weights(self, module):
+        """ Initialize the weights. Use the base model initialization function.
+        """
+        self.transformer.init_weights(module)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiate a :class:`~pytorch_transformers.DerivedAutoModel` with one of the base model classes of the library
+        from a pre-trained model configuration.
+
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `bert`: BertConfig (Bert model)
+            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
+            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
+            - contains `xlnet`: XLNetConfig (XLNet model)
+            - contains `xlm`: XLMConfig (XLM model)
+
+            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+            To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            **pretrained_model_name_or_path**: either:
+                - a string with the `shortcut name` of a pre-trained model to load from cache
+                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
+                - a path to a `directory` containing a configuration file saved
+                    using the `save_pretrained(save_directory)` method.
+                - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
+                    In this case, ``from_tf`` should be set to True and a configuration object should be
+                    provided as `config` argument. This loading option is slower than converting the TensorFlow
+                    checkpoint in a PyTorch model using the provided conversion scripts and loading
+                    the PyTorch model afterwards.
+            **model_args**: (`optional`) Sequence:
+                All remaning positional arguments will be passed to the underlying model's __init__ function
+            **config**: an optional configuration for the model to use instead of an automatically loaded configuation.
+                Configuration can be automatically loaded when:
+                - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
+                - the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
+            **state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
+                from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
+                a simpler option.
+            **cache_dir**: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+            **output_loading_info**: (`optional`) boolean:
+                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
+            **kwargs**: (`optional`) dict:
+                Dictionary of key, values to update the configuration object after loading.
+                Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
+
+               - If a configuration is provided with `config`, **kwargs will be directly passed
+                 to the underlying model's __init__ method.
+               - If a configuration is not provided, **kwargs will be first passed to the pretrained
+                 model configuration class loading function (`PretrainedConfig.from_pretrained`).
+                 Each key of **kwargs that corresponds to a configuration attribute
+                 will be used to override said attribute with the supplied **kwargs value.
+                 Remaining keys that do not correspond to any configuration attribute will
+                 be passed to the underlying model's __init__ function.
+
+        Examples::
+
+            model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        if 'bert' in pretrained_model_name_or_path:
+            base_model_class = BertModel
+        elif 'openai-gpt' in pretrained_model_name_or_path:
+            base_model_class = OpenAIGPTModel
+        elif 'gpt2' in pretrained_model_name_or_path:
+            base_model_class = GPT2Model
+        elif 'transfo-xl' in pretrained_model_name_or_path:
+            base_model_class = TransfoXLModel
+        elif 'xlnet' in pretrained_model_name_or_path:
+            base_model_class = XLNetModel
+        elif 'xlm' in pretrained_model_name_or_path:
+            base_model_class = XLMModel
+        else:
+            raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                            "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                            "'xlm'".format(pretrained_model_name_or_path))
+
+        # Get a pretrained base_model
+        base_model = base_model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+
+        # Create our derived model
+        model = cls(base_model)
+
+        # Setup class attribute from the base model class
+        model.config_class = base_model.config_class
+        model.pretrained_model_archive_map = base_model.pretrained_model_archive_map
+        model.load_tf_weights = base_model.load_tf_weights
+
+        return model
+
+
+class AutoModelWithLMHead(DerivedAutoModel):
+    r"""
+        :class:`~pytorch_transformers.AutoModelWithLMHead` is a base class for language modeling
+        that contains
+        
+            - a base model instantiated as one of the base model classes of the library when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class method, and .
+            - a language modeling head on top of the base model.
+
+        The `from_pretrained()` method take care of using the correct base model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `bert`: BertConfig (Bert model)
+            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
+            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
+            - contains `xlnet`: XLNetConfig (XLNet model)
+            - contains `xlm`: XLMConfig (XLM model)
+
+        This class should usually not be instantiated using `__init__()` but `from_pretrained()`.
+    """
+
+    def __init__(self, base_model):
+        super(AutoModelWithLMHead, self).__init__(base_model)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        # get input embeddings - whatever the model is
+        input_embeddings = self.transformer.resize_token_embeddings(new_num_tokens=None)
+
+        # tie of clone (torchscript) embeddings
+        self._tie_or_clone_weights(self.lm_head, input_embeddings)
+
+    def forward(self, input_ids, **kwargs):
+        labels = kwargs.pop('labels', None)  # Python 2 compatibility...
+
+        transformer_outputs = self.transformer(input_ids, **kwargs)
+        hidden_states = transformer_outputs[0]
+
+        lm_logits = self.lm_head(hidden_states)
+
+        outputs = (lm_logits,) + transformer_outputs[1:]
+        if labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)),
+                            labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
+
+
+class AutoModelForSequenceClassification(DerivedAutoModel):
+    r"""
+        :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a class for sequence classification
+        that contains
+        
+            - a base model instantiated as one of the base model classes of the library when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class method, and .
+            - a classification head on top of the base model.
+
+        The `from_pretrained()` method take care of using the correct base model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `bert`: BertConfig (Bert model)
+            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
+            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
+            - contains `xlnet`: XLNetConfig (XLNet model)
+            - contains `xlm`: XLMConfig (XLM model)
+
+        This class should usually not be instantiated using `__init__()` but `from_pretrained()`.
+    """
+
+    def __init__(self, base_model):
+        super(AutoModelForSequenceClassification, self).__init__(base_model)
+        self.num_labels = base_model.config.num_labels
+        self.sequence_summary = SequenceSummary(base_model.config)
+
+        self.apply(self.init_weights)
+
+    def forward(self, input_ids, cls_index, **kwargs):
+        labels = kwargs.pop('labels', None)  # Python 2 compatibility...
+
+        transformer_outputs = self.transformer(input_ids, **kwargs)
+
+        output = transformer_outputs[0]
+        logits = self.sequence_summary(output, cls_index=cls_index)
+
+        outputs = (logits,) + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
+
+        if labels is not None:
+            if self.num_labels == 1:
+                #  We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 4341f0d8a1..5268c5de7d 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -137,7 +137,7 @@ class GPT2Config(PretrainedConfig):
         initializer_range=0.02,
 
         num_labels=1,
-        summary_type='token_ids',
+        summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
         summary_proj_to_labels=True,
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index a6cb6212ef..c51023444d 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -171,7 +171,7 @@ class OpenAIGPTConfig(PretrainedConfig):
         predict_special_tokens=True,
 
         num_labels=1,
-        summary_type='token_ids',
+        summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
         summary_proj_to_labels=True,
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 8970cd56f8..2664c542e0 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -765,7 +765,7 @@ class SequenceSummary(nn.Module):
                 - 'last' => [default] take the last token hidden state (like XLNet)
                 - 'first' => take the first token hidden state (like Bert)
                 - 'mean' => take the mean of all tokens hidden states
-                - 'token_ids' => supply a Tensor of classification token indices (GPT/GPT-2)
+                - 'cls_index' => supply a Tensor of classification token position (GPT/GPT-2)
                 - 'attn' => Not implemented now, use multi-head attention
             summary_use_proj: Add a projection after the vector extraction
             summary_proj_to_labels: If True, the projection outputs to config.num_labels classes (otherwise to hidden_size). Default: False.
@@ -803,11 +803,11 @@ class SequenceSummary(nn.Module):
         if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
             self.last_dropout = nn.Dropout(config.summary_last_dropout)
 
-    def forward(self, hidden_states, token_ids=None):
+    def forward(self, hidden_states, cls_index=None):
         """ hidden_states: float Tensor in shape [bsz, seq_len, hidden_size], the hidden-states of the last layer.
-            token_ids: [optional] index of the classification token if summary_type == 'token_ids',
+            cls_index: [optional] position of the classification token if summary_type == 'cls_index',
                 shape (bsz,) or more generally (bsz, ...) where ... are optional leading dimensions of hidden_states.
-                if summary_type == 'token_ids' and token_ids is None:
+                if summary_type == 'cls_index' and cls_index is None:
                     we take the last token of the sequence as classification token
         """
         if self.summary_type == 'last':
@@ -816,14 +816,14 @@ class SequenceSummary(nn.Module):
             output = hidden_states[:, 0]
         elif self.summary_type == 'mean':
             output = hidden_states.mean(dim=1)
-        elif self.summary_type == 'token_ids':
-            if token_ids is None:
-                token_ids = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
+        elif self.summary_type == 'cls_index':
+            if cls_index is None:
+                cls_index = torch.full_like(hidden_states[..., :1, :], hidden_states.shape[-2]-1, dtype=torch.long)
             else:
-                token_ids = token_ids.unsqueeze(-1).unsqueeze(-1)
-                token_ids = token_ids.expand((-1,) * (token_ids.dim()-1) + (hidden_states.size(-1),))
-            # shape of token_ids: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
-            output = hidden_states.gather(-2, token_ids).squeeze(-2) # shape (bsz, XX, hidden_size)
+                cls_index = cls_index.unsqueeze(-1).unsqueeze(-1)
+                cls_index = cls_index.expand((-1,) * (cls_index.dim()-1) + (hidden_states.size(-1),))
+            # shape of cls_index: (bsz, XX, 1, hidden_size) where XX are optional leading dim of hidden_states
+            output = hidden_states.gather(-2, cls_index).squeeze(-2) # shape (bsz, XX, hidden_size)
         elif self.summary_type == 'attn':
             raise NotImplementedError
 

From 7c524d631e4c0fd0531d02d6a155fc95a3e90810 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 16:25:54 +0200
Subject: [PATCH 23/35] add issue templates

---
 .github/ISSUE_TEMPLATE/bug-report.md      | 36 +++++++++++++++++++
 .github/ISSUE_TEMPLATE/feature-request.md | 16 +++++++++
 .github/ISSUE_TEMPLATE/migration.md       | 43 +++++++++++++++++++++++
 .github/ISSUE_TEMPLATE/question-help.md   |  8 +++++
 4 files changed, 103 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/bug-report.md
 create mode 100644 .github/ISSUE_TEMPLATE/feature-request.md
 create mode 100644 .github/ISSUE_TEMPLATE/migration.md
 create mode 100644 .github/ISSUE_TEMPLATE/question-help.md

diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
new file mode 100644
index 0000000000..0d9439887b
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -0,0 +1,36 @@
+---
+name: "\U0001F41B Bug Report"
+about: Submit a bug report to help us improve PyTorch Transformers
+---
+
+## 🐛 Bug
+
+<!-- A clear and concise description of what the bug is. -->
+
+## To Reproduce
+
+Steps to reproduce the behavior:
+
+1.
+2.
+3.
+
+<!-- If you have a code sample, error messages, stack traces, please provide it here as well. -->
+
+## Expected behavior
+
+<!-- A clear and concise description of what you expected to happen. -->
+
+## Environment
+
+* OS:
+* Python version:
+* PyTorch version:
+* PyTorch Transformers version (or branch):
+* Using GPU ?
+* Distributed of parallel setup ?
+* Any other relevant information:
+
+## Additional context
+
+<!-- Add any other context about the problem here. -->
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md
new file mode 100644
index 0000000000..828e3737be
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature-request.md
@@ -0,0 +1,16 @@
+---
+name: "\U0001F680 Feature Request"
+about: Submit a proposal/request for a new PyTorch Transformers feature
+---
+
+## 🚀 Feature
+
+<!-- A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist. -->
+
+## Motivation
+
+<!-- Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. -->
+
+## Additional context
+
+<!-- Add any other context or screenshots about the feature request here. -->
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/migration.md b/.github/ISSUE_TEMPLATE/migration.md
new file mode 100644
index 0000000000..9a8b19dffa
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -0,0 +1,43 @@
+---
+name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
+about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-Transformers
+---
+
+## 📚 Migration
+
+<!-- Give at least the following information -->
+
+Model I am using (Bert, XLNet....):
+
+The problem arise when using:
+* [ ] the official example scripts
+* [ ] my own modified scripts
+
+The tasks I am working on is:
+* [ ] an official GLUE/SQUaD task: (give the name)
+* [ ] my own task or dataset: (give details)
+
+Language I am using the model on (English, Chinese....):
+
+Details of the issue:
+
+<!-- A clear and concise description of the migration issue. If you have code snippets, please provide it here as well. -->
+
+## Environment
+
+* OS:
+* Python version:
+* PyTorch version:
+* PyTorch Transformers version (or branch):
+* Using GPU ?
+* Distributed of parallel setup ?
+* Any other relevant information:
+
+## Checklist
+
+- [ ] I have read the migration guide in the readme.
+- [ ] I checked if a related official extension example runs on my machine.
+
+## Additional context
+
+<!-- Add any other context about the problem here. -->
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/question-help.md b/.github/ISSUE_TEMPLATE/question-help.md
new file mode 100644
index 0000000000..8c76994b02
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/question-help.md
@@ -0,0 +1,8 @@
+---
+name: "❓Questions & Help"
+about: Start a general discussion related to PyTorch Transformers
+---
+
+## ❓ Questions & Help
+
+<!-- A clear and concise description of the question. -->
\ No newline at end of file

From 077ad693e9c3b5702ba9874f7a0f0ed8099c9773 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 16:46:29 +0200
Subject: [PATCH 24/35] tweak issue templates wordings

---
 .github/ISSUE_TEMPLATE/bug-report.md | 14 +++++++++++++-
 .github/ISSUE_TEMPLATE/migration.md  | 10 +++++-----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
index 0d9439887b..66f7831aea 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -5,7 +5,19 @@ about: Submit a bug report to help us improve PyTorch Transformers
 
 ## 🐛 Bug
 
-<!-- A clear and concise description of what the bug is. -->
+<!-- Important information -->
+
+Model I am using (Bert, XLNet....):
+
+Language I am using the model on (English, Chinese....):
+
+The problem arise when using:
+* [ ] the official example scripts: (give details)
+* [ ] my own modified scripts: (give details)
+
+The tasks I am working on is:
+* [ ] an official GLUE/SQUaD task: (give the name)
+* [ ] my own task or dataset: (give details)
 
 ## To Reproduce
 
diff --git a/.github/ISSUE_TEMPLATE/migration.md b/.github/ISSUE_TEMPLATE/migration.md
index 9a8b19dffa..cf0c9a4757 100644
--- a/.github/ISSUE_TEMPLATE/migration.md
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -5,20 +5,20 @@ about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-T
 
 ## 📚 Migration
 
-<!-- Give at least the following information -->
+<!-- Important information -->
 
 Model I am using (Bert, XLNet....):
 
+Language I am using the model on (English, Chinese....):
+
 The problem arise when using:
-* [ ] the official example scripts
-* [ ] my own modified scripts
+* [ ] the official example scripts: (give details)
+* [ ] my own modified scripts: (give details)
 
 The tasks I am working on is:
 * [ ] an official GLUE/SQUaD task: (give the name)
 * [ ] my own task or dataset: (give details)
 
-Language I am using the model on (English, Chinese....):
-
 Details of the issue:
 
 <!-- A clear and concise description of the migration issue. If you have code snippets, please provide it here as well. -->

From 70c10caa06d9feda3f446d0a82655f56cd2afdab Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 17:09:37 +0200
Subject: [PATCH 25/35] add option mentioned in #940

---
 examples/run_glue.py  | 6 ++++++
 examples/run_squad.py | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 0d4ffaa390..a939ea373b 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -247,6 +247,9 @@ def evaluate(args, model, tokenizer, prefix=""):
 
 
 def load_and_cache_examples(args, task, tokenizer, evaluate=False):
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
     processor = processors[task]()
     output_mode = output_modes[task]
     # Load data features from cache or dataset file
@@ -273,6 +276,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
 
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
     # Convert to Tensors and build dataset
     all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
     all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 7d768d2c43..e62a1f1ff3 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -272,6 +272,9 @@ def evaluate(args, model, tokenizer, prefix=""):
 
 
 def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
     # Load data features from cache or dataset file
     input_file = args.predict_file if evaluate else args.train_file
     cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
@@ -296,6 +299,9 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
 
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
     # Convert to Tensors and build dataset
     all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
     all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)

From 7223886dc944b5476ea6be1a9838738644a2e9a1 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 17:16:56 +0200
Subject: [PATCH 26/35] fix #944

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 703eb47df9..f3d2865ba8 100644
--- a/README.md
+++ b/README.md
@@ -385,6 +385,7 @@ for batch in train_data:
     loss.backward()
     scheduler.step()
     optimizer.step()
+    optimizer.zero_grad()
 ```
 
 ## Citation

From 3a126e73dd020be851d59cfcdc741fe3e8c6ad4f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 17:26:29 +0200
Subject: [PATCH 27/35] fix #950

---
 .../convert_transfo_xl_checkpoint_to_pytorch.py          | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
index b6672aedf7..5733146444 100755
--- a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -24,11 +24,10 @@ from io import open
 import torch
 
 import pytorch_transformers.tokenization_transfo_xl as data_utils
-from pytorch_transformers.modeling_transfo_xl import (CONFIG_NAME,
-                                                         WEIGHTS_NAME,
-                                                         TransfoXLConfig,
-                                                         TransfoXLLMHeadModel,
-                                                         load_tf_weights_in_transfo_xl)
+
+from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
+from pytorch_transformers.modeling_transfo_xl import (TransfoXLConfig, TransfoXLLMHeadModel,
+                                                      load_tf_weights_in_transfo_xl)
 from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
 
 if sys.version_info[0] == 2:

From ed4e5422604b04df823eb2011e9ed4d766cf9980 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 18:14:07 +0200
Subject: [PATCH 28/35] adding tests

---
 pytorch_transformers/__init__.py              |  2 +
 pytorch_transformers/modeling_auto.py         | 27 ++++++++-
 pytorch_transformers/modeling_utils.py        |  2 +-
 .../tests/modeling_auto_test.py               | 55 +++++++++++++++++++
 4 files changed, 83 insertions(+), 3 deletions(-)
 create mode 100644 pytorch_transformers/tests/modeling_auto_test.py

diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index 72d666448e..d4ddda94fa 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -7,6 +7,8 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_utils import (PreTrainedTokenizer)
 
+from .modeling_auto import (AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoModelWithLMHead)
+
 from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
                        BertForSequenceClassification, BertForMultipleChoice,
diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py
index 7d3ea7ec60..22a35090aa 100644
--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -393,6 +393,8 @@ class AutoModelWithLMHead(DerivedAutoModel):
 
     def __init__(self, base_model):
         super(AutoModelWithLMHead, self).__init__(base_model)
+        config = base_model.config
+
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
 
         self.apply(self.init_weights)
@@ -426,6 +428,17 @@ class AutoModelWithLMHead(DerivedAutoModel):
         return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
 
 
+AUTO_MODEL_SEQUENCE_SUMMARY_DEFAULTS = {
+     'num_labels': 2,
+     'summary_type': 'first',
+     'summary_use_proj': True,
+     'summary_activation': None,
+     'summary_proj_to_labels': True,
+     'summary_first_dropout': 0.1
+}
+
+
+
 class AutoModelForSequenceClassification(DerivedAutoModel):
     r"""
         :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a class for sequence classification
@@ -451,8 +464,18 @@ class AutoModelForSequenceClassification(DerivedAutoModel):
 
     def __init__(self, base_model):
         super(AutoModelForSequenceClassification, self).__init__(base_model)
-        self.num_labels = base_model.config.num_labels
-        self.sequence_summary = SequenceSummary(base_model.config)
+        # Complete configuration with defaults if necessary
+        config = base_model.config
+        for key, value in AUTO_MODEL_SEQUENCE_SUMMARY_DEFAULTS.items():
+            if not hasattr(config, key):
+                setattr(config, key, value)
+
+        # Update base model and derived model config
+        self.transformer.config = config
+        self.config = config
+
+        self.num_labels = config.num_labels
+        self.sequence_summary = SequenceSummary(config)
 
         self.apply(self.init_weights)
 
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 2664c542e0..f832b482af 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -777,7 +777,7 @@ class SequenceSummary(nn.Module):
         super(SequenceSummary, self).__init__()
 
         self.summary_type = config.summary_type if hasattr(config, 'summary_use_proj') else 'last'
-        if config.summary_type == 'attn':
+        if self.summary_type == 'attn':
             # We should use a standard multi-head attention module with absolute positional embedding for that.
             # Cf. https://github.com/zihangdai/xlnet/blob/master/modeling.py#L253-L276
             # We can probably just use the multi-head attention module of PyTorch >=1.1.0
diff --git a/pytorch_transformers/tests/modeling_auto_test.py b/pytorch_transformers/tests/modeling_auto_test.py
new file mode 100644
index 0000000000..07042a255c
--- /dev/null
+++ b/pytorch_transformers/tests/modeling_auto_test.py
@@ -0,0 +1,55 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import shutil
+import pytest
+import logging
+
+from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel, AutoModelForSequenceClassification, AutoModelWithLMHead
+from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+
+from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
+
+
+class AutoModelTest(unittest.TestCase):
+    def test_model_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            config = AutoConfig.from_pretrained(model_name)
+            self.assertIsNotNone(config)
+            self.assertIsInstance(config, BertConfig)
+
+            model = AutoModel.from_pretrained(model_name)
+            model, loading_info = AutoModel.from_pretrained(model_name, output_loading_info=True)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(model, BertModel)
+            for value in loading_info.values():
+                self.assertEqual(len(value), 0)
+
+            model = AutoModelForSequenceClassification.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(getattr(model, model.base_model_prefix), BertModel)
+
+            model = AutoModelWithLMHead.from_pretrained(model_name)
+            self.assertIsNotNone(model)
+            self.assertIsInstance(getattr(model, model.base_model_prefix), BertModel)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 13936a962102ed20424838fe5d445a28b5225d08 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 18:48:16 +0200
Subject: [PATCH 29/35] update doc and tests

---
 docs/source/index.rst                         |  1 +
 docs/source/model_doc/auto.rst                | 28 +++++++++--
 pytorch_transformers/__init__.py              |  1 +
 .../tests/tokenization_auto_test.py           | 46 +++++++++++++++++++
 4 files changed, 72 insertions(+), 4 deletions(-)
 create mode 100644 pytorch_transformers/tests/tokenization_auto_test.py

diff --git a/docs/source/index.rst b/docs/source/index.rst
index b80fd8437b..b613596331 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -40,6 +40,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
     :maxdepth: 2
     :caption: Package Reference
 
+    model_doc/auto
     model_doc/bert
     model_doc/gpt
     model_doc/transformerxl
diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst
index ad439fff03..43f6e103bd 100644
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -1,9 +1,15 @@
-AutoModel, AutoConfig and AutoTokenizer - Standard derived classes
----------------------------------------------------------------
+AutoModels
+-----------
 
-In many case, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method.
+In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method.
+
+AutoClasses are here to do this job for you so that you automatically retreive the relevant model given the name/path to the pretrained weights/config/vocabulary.
+
+There are two types of AutoClasses:
+
+- ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer``: instantiating these ones will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of ``BertModel``)
+- All the others (``AutoModelWithLMHead``, ``AutoModelForSequenceClassification``...)  are standardized Auto classes for finetuning. Instantiating these will create instance of the same class (``AutoModelWithLMHead``, ``AutoModelForSequenceClassification``...) comprising (i) the relevant base model class (as mentioned just above) and (ii) a standard fine-tuning head on top, convenient for the task.
 
-Auto classes are here to do this job for you so that you automatically retreive the relevant model given the name/path to the pretrained weights/config/vocabulary.
 
 ``AutoConfig``
 ~~~~~~~~~~~~~~~~~~~~~
@@ -19,6 +25,20 @@ Auto classes are here to do this job for you so that you automatically retreive
     :members:
 
 
+``AutoModelWithLMHead``
+~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.AutoModelWithLMHead
+    :members:
+
+
+``AutoModelForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.AutoModelForSequenceClassification
+    :members:
+
+
 ``AutoTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index d4ddda94fa..110c3dc3c7 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -1,4 +1,5 @@
 __version__ = "1.0.0"
+from .tokenization_auto import AutoTokenizer
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
diff --git a/pytorch_transformers/tests/tokenization_auto_test.py b/pytorch_transformers/tests/tokenization_auto_test.py
new file mode 100644
index 0000000000..f4f82083f2
--- /dev/null
+++ b/pytorch_transformers/tests/tokenization_auto_test.py
@@ -0,0 +1,46 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import shutil
+import pytest
+import logging
+
+from pytorch_transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
+from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from pytorch_transformers.modeling_gpt2 import GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+class AutoTokenizerTest(unittest.TestCase):
+    def test_tokenizer_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            self.assertIsNotNone(tokenizer)
+            self.assertIsInstance(tokenizer, BertTokenizer)
+            self.assertGreater(len(tokenizer), 0)
+
+        for model_name in list(GPT2_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            self.assertIsNotNone(tokenizer)
+            self.assertIsInstance(tokenizer, GPT2Tokenizer)
+            self.assertGreater(len(tokenizer), 0)
+
+
+if __name__ == "__main__":
+    unittest.main()

From 0b524b084857d0bf54eb613304a61bcdbd71e6fb Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 19:08:19 +0200
Subject: [PATCH 30/35] remove derived classes for now

---
 docs/source/model_doc/auto.rst                |  21 +-
 pytorch_transformers/__init__.py              |   2 +-
 pytorch_transformers/modeling_auto.py         | 266 ------------------
 .../tests/modeling_auto_test.py               |  10 +-
 4 files changed, 4 insertions(+), 295 deletions(-)

diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst
index 43f6e103bd..7b56eabafe 100644
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -3,12 +3,9 @@ AutoModels
 
 In many cases, the architecture you want to use can be guessed from the name or the path of the pretrained model you are supplying to the ``from_pretrained`` method.
 
-AutoClasses are here to do this job for you so that you automatically retreive the relevant model given the name/path to the pretrained weights/config/vocabulary.
+AutoClasses are here to do this job for you so that you automatically retreive the relevant model given the name/path to the pretrained weights/config/vocabulary:
 
-There are two types of AutoClasses:
-
-- ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer``: instantiating these ones will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of ``BertModel``)
-- All the others (``AutoModelWithLMHead``, ``AutoModelForSequenceClassification``...)  are standardized Auto classes for finetuning. Instantiating these will create instance of the same class (``AutoModelWithLMHead``, ``AutoModelForSequenceClassification``...) comprising (i) the relevant base model class (as mentioned just above) and (ii) a standard fine-tuning head on top, convenient for the task.
+Instantiating one of ``AutoModel``, ``AutoConfig`` and ``AutoTokenizer`` will directly create a class of the relevant architecture (ex: ``model = AutoModel.from_pretrained('bert-base-cased')`` will create a instance of ``BertModel``).
 
 
 ``AutoConfig``
@@ -25,20 +22,6 @@ There are two types of AutoClasses:
     :members:
 
 
-``AutoModelWithLMHead``
-~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_transformers.AutoModelWithLMHead
-    :members:
-
-
-``AutoModelForSequenceClassification``
-~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-
-.. autoclass:: pytorch_transformers.AutoModelForSequenceClassification
-    :members:
-
-
 ``AutoTokenizer``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index 110c3dc3c7..04e5c3c9dd 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -8,7 +8,7 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_utils import (PreTrainedTokenizer)
 
-from .modeling_auto import (AutoConfig, AutoModel, AutoModelForSequenceClassification, AutoModelWithLMHead)
+from .modeling_auto import (AutoConfig, AutoModel)
 
 from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
                        BertForMaskedLM, BertForNextSentencePrediction,
diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py
index 22a35090aa..64b151e3a3 100644
--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -234,269 +234,3 @@ class AutoModel(object):
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
                          "'xlm'".format(pretrained_model_name_or_path))
-
-
-class DerivedAutoModel(PreTrainedModel):
-    r"""
-        :class:`~pytorch_transformers.DerivedAutoModel` is a base class for building
-        standardized derived models on top of :class:`~pytorch_transformers.AutoModel` by adding heads
-
-        The `from_pretrained()` method take care of using the correct base model class instance
-        using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The base model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `bert`: BertConfig (Bert model)
-            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
-            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
-            - contains `xlnet`: XLNetConfig (XLNet model)
-            - contains `xlm`: XLMConfig (XLM model)
-
-        This class should usually not be instantiated using `__init__()` but `from_pretrained()`.
-    """
-    config_class = None
-    pretrained_model_archive_map = {}
-    load_tf_weights = lambda model, config, path: None
-    base_model_prefix = "transformer"
-
-    def __init__(self, base_model):
-        super(DerivedAutoModel, self).__init__(base_model.config)
-        self.transformer = base_model
-
-    def init_weights(self, module):
-        """ Initialize the weights. Use the base model initialization function.
-        """
-        self.transformer.init_weights(module)
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiate a :class:`~pytorch_transformers.DerivedAutoModel` with one of the base model classes of the library
-        from a pre-trained model configuration.
-
-        The base model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `bert`: BertConfig (Bert model)
-            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
-            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
-            - contains `xlnet`: XLNetConfig (XLNet model)
-            - contains `xlm`: XLMConfig (XLM model)
-
-            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-            To train the model, you should first set it back in training mode with `model.train()`
-
-        Params:
-            **pretrained_model_name_or_path**: either:
-                - a string with the `shortcut name` of a pre-trained model to load from cache
-                    or download and cache if not already stored in cache (e.g. 'bert-base-uncased').
-                - a path to a `directory` containing a configuration file saved
-                    using the `save_pretrained(save_directory)` method.
-                - a path or url to a tensorflow index checkpoint `file` (e.g. `./tf_model/model.ckpt.index`).
-                    In this case, ``from_tf`` should be set to True and a configuration object should be
-                    provided as `config` argument. This loading option is slower than converting the TensorFlow
-                    checkpoint in a PyTorch model using the provided conversion scripts and loading
-                    the PyTorch model afterwards.
-            **model_args**: (`optional`) Sequence:
-                All remaning positional arguments will be passed to the underlying model's __init__ function
-            **config**: an optional configuration for the model to use instead of an automatically loaded configuation.
-                Configuration can be automatically loaded when:
-                - the model is a model provided by the library (loaded with a `shortcut name` of a pre-trained model), or
-                - the model was saved using the `save_pretrained(save_directory)` (loaded by suppling the save directory).
-            **state_dict**: an optional state dictionnary for the model to use instead of a state dictionary loaded
-                from saved weights file.
-                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
-                In this case though, you should check if using `save_pretrained(dir)` and `from_pretrained(save_directory)` is not
-                a simpler option.
-            **cache_dir**: (`optional`) string:
-                Path to a directory in which a downloaded pre-trained model
-                configuration should be cached if the standard cache should not be used.
-            **output_loading_info**: (`optional`) boolean:
-                Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
-            **kwargs**: (`optional`) dict:
-                Dictionary of key, values to update the configuration object after loading.
-                Can be used to override selected configuration parameters. E.g. ``output_attention=True``.
-
-               - If a configuration is provided with `config`, **kwargs will be directly passed
-                 to the underlying model's __init__ method.
-               - If a configuration is not provided, **kwargs will be first passed to the pretrained
-                 model configuration class loading function (`PretrainedConfig.from_pretrained`).
-                 Each key of **kwargs that corresponds to a configuration attribute
-                 will be used to override said attribute with the supplied **kwargs value.
-                 Remaining keys that do not correspond to any configuration attribute will
-                 be passed to the underlying model's __init__ function.
-
-        Examples::
-
-            model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
-        """
-        if 'bert' in pretrained_model_name_or_path:
-            base_model_class = BertModel
-        elif 'openai-gpt' in pretrained_model_name_or_path:
-            base_model_class = OpenAIGPTModel
-        elif 'gpt2' in pretrained_model_name_or_path:
-            base_model_class = GPT2Model
-        elif 'transfo-xl' in pretrained_model_name_or_path:
-            base_model_class = TransfoXLModel
-        elif 'xlnet' in pretrained_model_name_or_path:
-            base_model_class = XLNetModel
-        elif 'xlm' in pretrained_model_name_or_path:
-            base_model_class = XLMModel
-        else:
-            raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                            "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                            "'xlm'".format(pretrained_model_name_or_path))
-
-        # Get a pretrained base_model
-        base_model = base_model_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-
-        # Create our derived model
-        model = cls(base_model)
-
-        # Setup class attribute from the base model class
-        model.config_class = base_model.config_class
-        model.pretrained_model_archive_map = base_model.pretrained_model_archive_map
-        model.load_tf_weights = base_model.load_tf_weights
-
-        return model
-
-
-class AutoModelWithLMHead(DerivedAutoModel):
-    r"""
-        :class:`~pytorch_transformers.AutoModelWithLMHead` is a base class for language modeling
-        that contains
-        
-            - a base model instantiated as one of the base model classes of the library when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class method, and .
-            - a language modeling head on top of the base model.
-
-        The `from_pretrained()` method take care of using the correct base model class instance
-        using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The base model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `bert`: BertConfig (Bert model)
-            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
-            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
-            - contains `xlnet`: XLNetConfig (XLNet model)
-            - contains `xlm`: XLMConfig (XLM model)
-
-        This class should usually not be instantiated using `__init__()` but `from_pretrained()`.
-    """
-
-    def __init__(self, base_model):
-        super(AutoModelWithLMHead, self).__init__(base_model)
-        config = base_model.config
-
-        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-
-        self.apply(self.init_weights)
-        self.tie_weights()
-
-    def tie_weights(self):
-        """ Make sure we are sharing the input and output embeddings.
-            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
-        """
-        # get input embeddings - whatever the model is
-        input_embeddings = self.transformer.resize_token_embeddings(new_num_tokens=None)
-
-        # tie of clone (torchscript) embeddings
-        self._tie_or_clone_weights(self.lm_head, input_embeddings)
-
-    def forward(self, input_ids, **kwargs):
-        labels = kwargs.pop('labels', None)  # Python 2 compatibility...
-
-        transformer_outputs = self.transformer(input_ids, **kwargs)
-        hidden_states = transformer_outputs[0]
-
-        lm_logits = self.lm_head(hidden_states)
-
-        outputs = (lm_logits,) + transformer_outputs[1:]
-        if labels is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)),
-                            labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs  # (loss), lm_logits, presents, (all hidden_states), (attentions)
-
-
-AUTO_MODEL_SEQUENCE_SUMMARY_DEFAULTS = {
-     'num_labels': 2,
-     'summary_type': 'first',
-     'summary_use_proj': True,
-     'summary_activation': None,
-     'summary_proj_to_labels': True,
-     'summary_first_dropout': 0.1
-}
-
-
-
-class AutoModelForSequenceClassification(DerivedAutoModel):
-    r"""
-        :class:`~pytorch_transformers.AutoModelForSequenceClassification` is a class for sequence classification
-        that contains
-        
-            - a base model instantiated as one of the base model classes of the library when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class method, and .
-            - a classification head on top of the base model.
-
-        The `from_pretrained()` method take care of using the correct base model class instance
-        using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The base model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `bert`: BertConfig (Bert model)
-            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
-            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
-            - contains `xlnet`: XLNetConfig (XLNet model)
-            - contains `xlm`: XLMConfig (XLM model)
-
-        This class should usually not be instantiated using `__init__()` but `from_pretrained()`.
-    """
-
-    def __init__(self, base_model):
-        super(AutoModelForSequenceClassification, self).__init__(base_model)
-        # Complete configuration with defaults if necessary
-        config = base_model.config
-        for key, value in AUTO_MODEL_SEQUENCE_SUMMARY_DEFAULTS.items():
-            if not hasattr(config, key):
-                setattr(config, key, value)
-
-        # Update base model and derived model config
-        self.transformer.config = config
-        self.config = config
-
-        self.num_labels = config.num_labels
-        self.sequence_summary = SequenceSummary(config)
-
-        self.apply(self.init_weights)
-
-    def forward(self, input_ids, cls_index, **kwargs):
-        labels = kwargs.pop('labels', None)  # Python 2 compatibility...
-
-        transformer_outputs = self.transformer(input_ids, **kwargs)
-
-        output = transformer_outputs[0]
-        logits = self.sequence_summary(output, cls_index=cls_index)
-
-        outputs = (logits,) + transformer_outputs[1:]  # Keep new_mems and attention/hidden states if they are here
-
-        if labels is not None:
-            if self.num_labels == 1:
-                #  We are doing regression
-                loss_fct = MSELoss()
-                loss = loss_fct(logits.view(-1), labels.view(-1))
-            else:
-                loss_fct = CrossEntropyLoss()
-                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
-            outputs = (loss,) + outputs
-
-        return outputs
diff --git a/pytorch_transformers/tests/modeling_auto_test.py b/pytorch_transformers/tests/modeling_auto_test.py
index 07042a255c..d0c830abc7 100644
--- a/pytorch_transformers/tests/modeling_auto_test.py
+++ b/pytorch_transformers/tests/modeling_auto_test.py
@@ -21,7 +21,7 @@ import shutil
 import pytest
 import logging
 
-from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel, AutoModelForSequenceClassification, AutoModelWithLMHead
+from pytorch_transformers import AutoConfig, BertConfig, AutoModel, BertModel
 from pytorch_transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
@@ -42,14 +42,6 @@ class AutoModelTest(unittest.TestCase):
             for value in loading_info.values():
                 self.assertEqual(len(value), 0)
 
-            model = AutoModelForSequenceClassification.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(getattr(model, model.base_model_prefix), BertModel)
-
-            model = AutoModelWithLMHead.from_pretrained(model_name)
-            self.assertIsNotNone(model)
-            self.assertIsInstance(getattr(model, model.base_model_prefix), BertModel)
-
 
 if __name__ == "__main__":
     unittest.main()

From beb03ec6c56e12b87fd94b97a36221b976b65651 Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Tue, 6 Aug 2019 11:15:57 +0800
Subject: [PATCH 31/35] Fix examples of loading pretrained models in docstring

---
 pytorch_transformers/modeling_bert.py       | 107 +++++++++-----------
 pytorch_transformers/modeling_gpt2.py       |  37 ++++---
 pytorch_transformers/modeling_openai.py     |  37 ++++---
 pytorch_transformers/modeling_transfo_xl.py |  22 ++--
 pytorch_transformers/modeling_xlm.py        |  52 +++++-----
 pytorch_transformers/modeling_xlnet.py      |  62 +++++-------
 6 files changed, 141 insertions(+), 176 deletions(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 3f2e7cbda1..6e2df0d2fa 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -643,12 +643,11 @@ class BertModel(BertPreTrainedModel):
 
     Examples::
 
-        config = BertConfig.from_pretrained('bert-base-uncased')
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertModel(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertModel.from_pretrained('bert-base-uncased')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -754,13 +753,11 @@ class BertForPreTraining(BertPreTrainedModel):
 
     Examples::
 
-        config = BertConfig.from_pretrained('bert-base-uncased')
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        
-        model = BertForPreTraining(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, seq_relationship_scores = outputs[:2]
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> prediction_scores, seq_relationship_scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -824,13 +821,11 @@ class BertForMaskedLM(BertPreTrainedModel):
 
     Examples::
 
-        config = BertConfig.from_pretrained('bert-base-uncased')
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        
-        model = BertForMaskedLM(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, masked_lm_labels=input_ids)
-        loss, prediction_scores = outputs[:2]
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForMaskedLM.from_pretrained('bert-base-uncased')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, masked_lm_labels=input_ids)
+        >>> loss, prediction_scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -891,13 +886,11 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 
     Examples::
 
-        config = BertConfig.from_pretrained('bert-base-uncased')
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        
-        model = BertForNextSentencePrediction(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        seq_relationship_scores = outputs[0]
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> seq_relationship_scores = outputs[0]
 
     """
     def __init__(self, config):
@@ -951,14 +944,12 @@ class BertForSequenceClassification(BertPreTrainedModel):
 
     Examples::
 
-        config = BertConfig.from_pretrained('bert-base-uncased')
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        
-        model = BertForSequenceClassification(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=labels)
+        >>> loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1057,15 +1048,13 @@ class BertForMultipleChoice(BertPreTrainedModel):
 
     Examples::
 
-        config = BertConfig.from_pretrained('bert-base-uncased')
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        
-        model = BertForMultipleChoice(config)
-        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, classification_scores = outputs[:2]
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
+        >>> choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
+        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        >>> labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=labels)
+        >>> loss, classification_scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1127,14 +1116,12 @@ class BertForTokenClassification(BertPreTrainedModel):
 
     Examples::
 
-        config = BertConfig.from_pretrained('bert-base-uncased')
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        
-        model = BertForTokenClassification(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, scores = outputs[:2]
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForTokenClassification.from_pretrained('bert-base-uncased')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=labels)
+        >>> loss, scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1203,15 +1190,13 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 
     Examples::
 
-        config = BertConfig.from_pretrained('bert-base-uncased')
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        
-        model = BertForQuestionAnswering(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss, start_scores, end_scores = outputs[:2]
+        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        >>> model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        >>> loss, start_scores, end_scores = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 5268c5de7d..9800b6658f 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -433,12 +433,11 @@ class GPT2Model(GPT2PreTrainedModel):
 
     Examples::
 
-        config = GPT2Config.from_pretrained('gpt2')
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2Model(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = GPT2Model.from_pretrained('gpt2')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -567,12 +566,11 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 
     Examples::
 
-        config = GPT2Config.from_pretrained('gpt2')
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2LMHeadModel(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = GPT2LMHeadModel.from_pretrained('gpt2')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=input_ids)
+        >>> loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -683,14 +681,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
 
     Examples::
 
-        config = GPT2Config.from_pretrained('gpt2')
-        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        model = GPT2DoubleHeadsModel(config)
-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
-        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, mc_token_ids)
-        lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        >>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, mc_token_ids)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 187c51c86e..500f455816 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -439,12 +439,11 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
     Examples::
 
-        config = OpenAIGPTConfig.from_pretrained('openai-gpt')
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = OpenAIGPTModel(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        >>> model = OpenAIGPTModel.from_pretrained('openai-gpt')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -558,12 +557,11 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 
     Examples::
 
-        config = OpenAIGPTConfig.from_pretrained('openai-gpt')
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = OpenAIGPTLMHeadModel(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=input_ids)
-        loss, logits = outputs[:2]
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        >>> model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=input_ids)
+        >>> loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -665,14 +663,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
 
     Examples::
 
-        config = OpenAIGPTConfig.from_pretrained('openai-gpt')
-        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        model = OpenAIGPTDoubleHeadsModel(config)
-        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
-        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, mc_token_ids)
-        lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        >>> model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        >>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, mc_token_ids)
+        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 7c999edda7..927cc79fe6 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -968,12 +968,11 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
     Examples::
 
-        config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TransfoXLModel(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states, mems = outputs[:2]
+        >>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        >>> model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states, mems = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1284,12 +1283,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
 
     Examples::
 
-        config = TransfoXLConfig.from_pretrained('transfo-xl-wt103')
-        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        model = TransfoXLLMHeadModel(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        prediction_scores, mems = outputs[:2]
+        >>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        >>> model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> prediction_scores, mems = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 7325ff7875..ddf5fee328 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -472,12 +472,11 @@ class XLMModel(XLMPreTrainedModel):
 
     Examples::
 
-        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMModel(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> model = XLMModel.from_pretrained('xlm-mlm-en-2048')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     ATTRIBUTES = ['encoder', 'eos_index', 'pad_index',  # 'with_output', 
@@ -745,12 +744,11 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
 
     Examples::
 
-        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        model = XLMWithLMHeadModel(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -805,14 +803,12 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
 
     Examples::
 
-        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        
-        model = XLMForSequenceClassification(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=labels)
+        >>> loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -885,15 +881,13 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
 
     Examples::
 
-        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        
-        model = XLMForQuestionAnswering(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss, start_scores, end_scores = outputs[:2]
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        >>> loss, start_scores, end_scores = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 9c1752eb74..5b3e049ddf 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -712,12 +712,11 @@ class XLNetModel(XLNetPreTrainedModel):
 
     Examples::
 
-        config = XLNetConfig.from_pretrained('xlnet-large-cased')
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetModel(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        >>> model = XLNetModel.from_pretrained('xlnet-large-cased')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids)
+        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -1019,17 +1018,16 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
     Examples::
 
-        config = XLNetConfig.from_pretrained('xlnet-large-cased')
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        model = XLNetLMHeadModel(config)
-        # We show how to setup inputs to predict a next token using a bi-directional context.
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0)  # We will predict the masked token
-        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
-        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
-        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
-        outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
-        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
+        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        >>> model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
+        >>> # We show how to setup inputs to predict a next token using a bi-directional context.
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0)  # We will predict the masked token
+        >>> perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
+        >>> perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+        >>> target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
+        >>> target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+        >>> outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
+        >>> next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
 
     """
     def __init__(self, config):
@@ -1100,14 +1098,12 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
     Examples::
 
-        config = XLNetConfig.from_pretrained('xlnet-large-cased')
-        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        
-        model = XLNetForSequenceClassification(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, labels=labels)
-        loss, logits = outputs[:2]
+        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        >>> model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        >>> outputs = model(input_ids, labels=labels)
+        >>> loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1200,15 +1196,13 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
 
     Examples::
 
-        config = XLMConfig.from_pretrained('xlm-mlm-en-2048')
-        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        
-        model = XLMForQuestionAnswering(config)
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        start_positions = torch.tensor([1])
-        end_positions = torch.tensor([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss, start_scores, end_scores = outputs[:2]
+        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        >>> model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
+        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        >>> start_positions = torch.tensor([1])
+        >>> end_positions = torch.tensor([3])
+        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        >>> loss, start_scores, end_scores = outputs[:2]
 
     """
     def __init__(self, config):

From f889e77b9c3e8043a30f909f3e4e3c0a016ff6df Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Tue, 6 Aug 2019 11:30:35 +0800
Subject: [PATCH 32/35] Fix examples of loading pretrained models in docstring

---
 pytorch_transformers/modeling_gpt2.py | 34 +++++++++++++--------------
 1 file changed, 17 insertions(+), 17 deletions(-)

diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 9800b6658f..50cb834400 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -433,11 +433,11 @@ class GPT2Model(GPT2PreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        >>> model = GPT2Model.from_pretrained('gpt2')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        model = GPT2Model.from_pretrained('gpt2')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -566,11 +566,11 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        >>> model = GPT2LMHeadModel.from_pretrained('gpt2')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=input_ids)
-        >>> loss, logits = outputs[:2]
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        model = GPT2LMHeadModel.from_pretrained('gpt2')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=input_ids)
+        loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -681,13 +681,13 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        >>> model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
-        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
-        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        >>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, mc_token_ids)
-        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        model = GPT2DoubleHeadsModel.from_pretrained('gpt2')
+        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, mc_token_ids)
+        lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
     """
     def __init__(self, config):

From 72622926e59056b72cec8e95d4e45ee0927f20aa Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Tue, 6 Aug 2019 11:32:41 +0800
Subject: [PATCH 33/35] Fix examples in docstring

---
 pytorch_transformers/modeling_transfo_xl.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 927cc79fe6..cb5416964c 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -968,11 +968,11 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        >>> model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states, mems = outputs[:2]
+        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        model = TransfoXLModel.from_pretrained('transfo-xl-wt103')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states, mems = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1283,11 +1283,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
-        >>> model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> prediction_scores, mems = outputs[:2]
+        tokenizer = TransfoXLTokenizer.from_pretrained('transfo-xl-wt103')
+        model = TransfoXLLMHeadModel.from_pretrained('transfo-xl-wt103')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        prediction_scores, mems = outputs[:2]
 
     """
     def __init__(self, config):

From 6ec1ee9ec28ead1a7c065153df32271ead95b417 Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Tue, 6 Aug 2019 11:32:54 +0800
Subject: [PATCH 34/35] Fix examples in docstring

---
 pytorch_transformers/modeling_bert.py   | 92 ++++++++++++-------------
 pytorch_transformers/modeling_openai.py | 34 ++++-----
 pytorch_transformers/modeling_xlm.py    | 46 ++++++-------
 pytorch_transformers/modeling_xlnet.py  | 56 +++++++--------
 4 files changed, 114 insertions(+), 114 deletions(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 6e2df0d2fa..3a6a50d0ed 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -643,11 +643,11 @@ class BertModel(BertPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertModel.from_pretrained('bert-base-uncased')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = BertModel.from_pretrained('bert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -753,11 +753,11 @@ class BertForPreTraining(BertPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertForPreTraining.from_pretrained('bert-base-uncased')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> prediction_scores, seq_relationship_scores = outputs[:2]
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = BertForPreTraining.from_pretrained('bert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        prediction_scores, seq_relationship_scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -821,11 +821,11 @@ class BertForMaskedLM(BertPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertForMaskedLM.from_pretrained('bert-base-uncased')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, masked_lm_labels=input_ids)
-        >>> loss, prediction_scores = outputs[:2]
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = BertForMaskedLM.from_pretrained('bert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, masked_lm_labels=input_ids)
+        loss, prediction_scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -886,11 +886,11 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> seq_relationship_scores = outputs[0]
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        seq_relationship_scores = outputs[0]
 
     """
     def __init__(self, config):
@@ -944,12 +944,12 @@ class BertForSequenceClassification(BertPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=labels)
-        >>> loss, logits = outputs[:2]
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = BertForSequenceClassification.from_pretrained('bert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1048,13 +1048,13 @@ class BertForMultipleChoice(BertPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
-        >>> choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
-        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        >>> labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=labels)
-        >>> loss, classification_scores = outputs[:2]
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = BertForMultipleChoice.from_pretrained('bert-base-uncased')
+        choices = ["Hello, my dog is cute", "Hello, my cat is amazing"]
+        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, classification_scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1116,12 +1116,12 @@ class BertForTokenClassification(BertPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertForTokenClassification.from_pretrained('bert-base-uncased')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=labels)
-        >>> loss, scores = outputs[:2]
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = BertForTokenClassification.from_pretrained('bert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, scores = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1190,13 +1190,13 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        >>> model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> start_positions = torch.tensor([1])
-        >>> end_positions = torch.tensor([3])
-        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        >>> loss, start_scores, end_scores = outputs[:2]
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = BertForQuestionAnswering.from_pretrained('bert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss, start_scores, end_scores = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 500f455816..20faf39972 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -439,11 +439,11 @@ class OpenAIGPTModel(OpenAIGPTPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        >>> model = OpenAIGPTModel.from_pretrained('openai-gpt')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        model = OpenAIGPTModel.from_pretrained('openai-gpt')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -557,11 +557,11 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        >>> model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=input_ids)
-        >>> loss, logits = outputs[:2]
+        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        model = OpenAIGPTLMHeadModel.from_pretrained('openai-gpt')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=input_ids)
+        loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -663,13 +663,13 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
-        >>> model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
-        >>> choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
-        >>> input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
-        >>> mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, mc_token_ids)
-        >>> lm_prediction_scores, mc_prediction_scores = outputs[:2]
+        tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
+        model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
+        choices = ["Hello, my dog is cute [CLS]", "Hello, my cat is cute [CLS]"]  # Assume you've added [CLS] to the vocabulary
+        input_ids = torch.tensor([tokenizer.encode(s) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        mc_token_ids = torch.tensor([-1, -1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, mc_token_ids)
+        lm_prediction_scores, mc_prediction_scores = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index ddf5fee328..03af828b9d 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -472,11 +472,11 @@ class XLMModel(XLMPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        >>> model = XLMModel.from_pretrained('xlm-mlm-en-2048')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        model = XLMModel.from_pretrained('xlm-mlm-en-2048')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     ATTRIBUTES = ['encoder', 'eos_index', 'pad_index',  # 'with_output', 
@@ -744,11 +744,11 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        >>> model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        model = XLMWithLMHeadModel.from_pretrained('xlm-mlm-en-2048')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -803,12 +803,12 @@ class XLMForSequenceClassification(XLMPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        >>> model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=labels)
-        >>> loss, logits = outputs[:2]
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        model = XLMForSequenceClassification.from_pretrained('xlm-mlm-en-2048')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -881,13 +881,13 @@ class XLMForQuestionAnswering(XLMPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        >>> model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> start_positions = torch.tensor([1])
-        >>> end_positions = torch.tensor([3])
-        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        >>> loss, start_scores, end_scores = outputs[:2]
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        model = XLMForQuestionAnswering.from_pretrained('xlm-mlm-en-2048')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss, start_scores, end_scores = outputs[:2]
 
     """
     def __init__(self, config):
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index 5b3e049ddf..f5dafe16fc 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -712,11 +712,11 @@ class XLNetModel(XLNetPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        >>> model = XLNetModel.from_pretrained('xlnet-large-cased')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids)
-        >>> last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = XLNetModel.from_pretrained('xlnet-large-cased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config):
@@ -1018,16 +1018,16 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        >>> model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
-        >>> # We show how to setup inputs to predict a next token using a bi-directional context.
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0)  # We will predict the masked token
-        >>> perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
-        >>> perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
-        >>> target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
-        >>> target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
-        >>> outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
-        >>> next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = XLNetLMHeadModel.from_pretrained('xlnet-large-cased')
+        # We show how to setup inputs to predict a next token using a bi-directional context.
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is very <mask>")).unsqueeze(0)  # We will predict the masked token
+        perm_mask = torch.zeros((1, input_ids.shape[1], input_ids.shape[1]), dtype=torch.float)
+        perm_mask[:, :, -1] = 1.0  # Previous tokens don't see last token
+        target_mapping = torch.zeros((1, 1, input_ids.shape[1]), dtype=torch.float)  # Shape [1, 1, seq_length] => let's predict one token
+        target_mapping[0, 0, -1] = 1.0  # Our first (and only) prediction will be the last token of the sequence (the masked token)
+        outputs = model(input_ids, perm_mask=perm_mask, target_mapping=target_mapping)
+        next_token_logits = outputs[0]  # Output has shape [target_mapping.size(0), target_mapping.size(1), config.vocab_size]
 
     """
     def __init__(self, config):
@@ -1098,12 +1098,12 @@ class XLNetForSequenceClassification(XLNetPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
-        >>> model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
-        >>> outputs = model(input_ids, labels=labels)
-        >>> loss, logits = outputs[:2]
+        tokenizer = XLNetTokenizer.from_pretrained('xlnet-large-cased')
+        model = XLNetForSequenceClassification.from_pretrained('xlnet-large-cased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
 
     """
     def __init__(self, config):
@@ -1196,13 +1196,13 @@ class XLNetForQuestionAnswering(XLNetPreTrainedModel):
 
     Examples::
 
-        >>> tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
-        >>> model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
-        >>> input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        >>> start_positions = torch.tensor([1])
-        >>> end_positions = torch.tensor([3])
-        >>> outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        >>> loss, start_scores, end_scores = outputs[:2]
+        tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')
+        model = XLMForQuestionAnswering.from_pretrained('xlnet-large-cased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        start_positions = torch.tensor([1])
+        end_positions = torch.tensor([3])
+        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        loss, start_scores, end_scores = outputs[:2]
 
     """
     def __init__(self, config):

From a6f412da01d15cdd60e242e9765d4dfc175adb24 Mon Sep 17 00:00:00 2001
From: Christopher Goh <chrisgzf@gmail.com>
Date: Wed, 7 Aug 2019 02:19:14 +0800
Subject: [PATCH 35/35] Fixed typo in migration guide

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 38143d0b1f..b86a5238c2 100644
--- a/README.md
+++ b/README.md
@@ -314,7 +314,7 @@ loss = outputs[0]
 # In pytorch-transformers you can also have access to the logits:
 loss, logits = outputs[:2]
 
-# And even the attention weigths if you configure the model to output them (and other outputs too, see the docstrings and documentation)
+# And even the attention weights if you configure the model to output them (and other outputs too, see the docstrings and documentation)
 model = BertForSequenceClassification.from_pretrained('bert-base-uncased', output_attentions=True)
 outputs = model(input_ids, labels=labels)
 loss, logits, attentions = outputs