From 117ed92992a8b7ec45b399a2b5e2f9b66358a7d4 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 16 Jul 2019 11:58:47 -0400
Subject: [PATCH 01/23] RestructuredText table for pretrained models.

---
 docs/source/pretrained_models.rst | 147 +++++++++++++++++++-----------
 1 file changed, 94 insertions(+), 53 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 2d72977951..e4ad7a6eaa 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -3,57 +3,98 @@ Pretrained models
 
 Here is the full list of the currently provided pretrained models together with a short presentation of each model.
 
-+===============+============================================================+===========================+ 
-| Architecture  | Shortcut name                                              | Details of the model      |
-+===============+============================================================+===========================+ 
-|               | ``bert-base-uncased``                                      | 12-layer, 768-hidden, 12-heads, 110M parameters
-|               |                                                            | Trained on lower-cased English text                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-large-uncased``                                     | 24-layer, 1024-hidden, 16-heads, 340M parameters
-|               |                                                            | Trained on lower-cased English text                  |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-base-cased``                                        | 12-layer, 768-hidden, 12-heads, 110M parameters
-|               |                                                            | Trained on cased English text                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-large-cased``                                       | 24-layer, 1024-hidden, 16-heads, 340M parameters                  |
-|               |                                                            | Trained on cased English text                  |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-base-multilingual-uncased``                         | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters
-|               |                                                            | Trained on lower-cased text in the top 102 languages with the largest Wikipedias
-|               |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-base-multilingual-cased``                           | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters                  |
-|               |                                                            | Trained on cased text in the top 104 languages with the largest Wikipedias
-|               |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|    BERT       | ``bert-base-chinese``                                      | 12-layer, 768-hidden, 12-heads, 110M parameters                  |
-|               |                                                            | Trained on cased Chinese Simplified and Traditional text |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-base-german-cased``                                 | 12-layer, 768-hidden, 12-heads, 110M parameters                  |
-|               |                                                            | Trained on cased German text by Deepset.ai |
-|               |                                                            | (see `details on deepset.ai website <https://deepset.ai/german-bert>`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-large-uncased-whole-word-masking``                  | 24-layer, 1024-hidden, 16-heads, 340M parameters                  |
-|               |                                                            | Trained on lower-cased English text using Whole-Word-Masking                  |
-|               |                                                            | (see `details <https://github.com/google-research/bert/#bert>`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-large-cased-whole-word-masking``                    | 24-layer, 1024-hidden, 16-heads, 340M parameters                  |
-|               |                                                            | Trained on cased English text using Whole-Word-Masking                  |
-|               |                                                            | (see `details <https://github.com/google-research/bert/#bert>`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | 24-layer, 1024-hidden, 16-heads, 340M parameters                  |
-|               |                                                            | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                  |
-|               |                                                            | (see details of fine-tuning in the `example section`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-large-cased-whole-word-masking-finetuned-squad``    | 24-layer, 1024-hidden, 16-heads, 340M parameters                  |
-|               |                                                            | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                  |
-|               |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`_)                 |
-|               +------------------------------------------------------------+---------------------------+ 
-|               | ``bert-base-cased-finetuned-mrpc``                         | 12-layer, 768-hidden, 12-heads, 110M parameters                  |
-|               |                                                            | The ``bert-base-cased`` model fine-tuned on MRPC                  |
-|               |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`_)                 |
-+---------------+------------------------------------------------------------+---------------------------+ 
-|    GPT        | Cells may span columns.                                                                |
-+---------------+----------------------------------------------------------------------------------------+ 
 
-.. <https://huggingface.co/pytorch-transformers/examples.html>`_
\ No newline at end of file
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+| Architecture      | Shortcut name                                              | Details of the model                                                                                                      |
++===================+============================================================+===========================================================================================================================+
+| BERT              | ``bert-base-uncased``                                      | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | Trained on lower-cased English text                                                                                       |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-uncased``                                     | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | Trained on lower-cased English text                                                                                       |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-cased``                                        | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | Trained on cased English text                                                                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-cased``                                       | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | Trained on cased English text                                                                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-multilingual-uncased``                         | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters                                               |
+|                   |                                                            | Trained on lower-cased text in the top 102 languages with the largest Wikipedias                                          |
+|                   |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__)                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-multilingual-cased``                           | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters                                                    |
+|                   |                                                            | Trained on cased text in the top 104 languages with the largest Wikipedias                                                |
+|                   |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__)                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-chinese``                                      | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | Trained on cased Chinese Simplified and Traditional text                                                                  |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-german-cased``                                 | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | Trained on cased German text by Deepset.ai                                                                                |
+|                   |                                                            | (see `details on deepset.ai website <https://deepset.ai/german-bert>`__)                                                  |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-uncased-whole-word-masking``                  | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | Trained on lower-cased English text using Whole-Word-Masking                                                              |
+|                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__)                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-cased-whole-word-masking``                    | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | Trained on cased English text using Whole-Word-Masking                                                                    |
+|                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__)                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                   |
+|                   |                                                            | (see details of fine-tuning in the `example section`__)                                                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                     |
+|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)       |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-cased-finetuned-mrpc``                         | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | The ``bert-base-cased`` model fine-tuned on MRPC                                                                          |
+|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)       |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+| GPT               | ``openai-gpt``                                             | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | OpenAI GPT English model                                                                                                  |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+| GPT-2             | ``gpt2``                                                   | 12-layer, 768-hidden, 12-heads, 117M parameters                                                                           |
+|                   |                                                            | OpenAI GPT-2 English model                                                                                                |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``gpt2-medium``                                            | 24-layer, 1024-hidden, 16-heads, 345M parameters                                                                          |
+|                   |                                                            | OpenAI's Medium-sized GPT-2 English model                                                                                 |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+| Transformer-XL    | ``transfo-xl-wt103``                                       | 18-layer, 1024-hidden, 16-heads, 257M parameters                                                                          |
+|                   |                                                            | English model trained on wikitext-103                                                                                     |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+| XLNet             | ``xlnet-base-cased``                                       | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
+|                   |                                                            | XLNet English model                                                                                                       |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlnet-large-cased``                                      | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
+|                   |                                                            | XLNet Large English model                                                                                                 |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+| XLM               | ``xlm-mlm-en-2048``                                        | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM English model                                                                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-ende-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM English-German Multi-language model                                                                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-enfr-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM English-French Multi-language model                                                                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-enro-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM English-Romanian Multi-language model                                                                                 |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-xnli15-1024``                                    | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM Model pre-trained with MLM on the `15 XNLI languages<https://github.com/facebookresearch/XNLI>`__.                    |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-tlm-xnli15-1024``                                | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages<https://github.com/facebookresearch/XNLI>`__.              |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-clm-enfr-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM English model trained with CLM (Causal Language Modeling)                                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-clm-ende-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
+|                   |                                                            | XLM English-German Multi-language model trained with CLM (Causal Language Modeling)                                       |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
+
+.. <https://huggingface.co/pytorch-transformers/examples.html>`__
\ No newline at end of file

From 9d381e7be9fb97e09777fa66aa3e336ca132af70 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 17 Jul 2019 09:25:38 -0400
Subject: [PATCH 02/23] Fixed incorrect links in the pretrained models table

---
 docs/source/pretrained_models.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index e4ad7a6eaa..b23a96ff7c 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -43,8 +43,8 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__)                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
-|                   |                                                            | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                   |
-|                   |                                                            | (see details of fine-tuning in the `example section`__)                                                                   |
+|                   |                                                            | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD (see details of fine-tuning in the                |
+|                   |                                                            | `example section <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`__)                           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
 |                   |                                                            | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                     |
@@ -85,10 +85,10 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | XLM English-Romanian Multi-language model                                                                                 |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-mlm-xnli15-1024``                                    | 12-layer, 1024-hidden, 8-heads                                                                                            |
-|                   |                                                            | XLM Model pre-trained with MLM on the `15 XNLI languages<https://github.com/facebookresearch/XNLI>`__.                    |
+|                   |                                                            | XLM Model pre-trained with MLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                   |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-mlm-tlm-xnli15-1024``                                | 12-layer, 1024-hidden, 8-heads                                                                                            |
-|                   |                                                            | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages<https://github.com/facebookresearch/XNLI>`__.              |
+|                   |                                                            | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-clm-enfr-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
 |                   |                                                            | XLM English model trained with CLM (Causal Language Modeling)                                                             |

From 1383c7b87af19bf21adf19d66cf6ee1a80555ea4 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 23 Jul 2019 17:52:20 +0200
Subject: [PATCH 03/23] Fix #869

---
 pytorch_transformers/modeling_utils.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 324cdc17c9..3f1df0a49d 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -39,6 +39,20 @@ WEIGHTS_NAME = "pytorch_model.bin"
 TF_WEIGHTS_NAME = 'model.ckpt'
 
 
+try:
+    from torch.nn import Identity
+except ImportError:
+    # Older PyTorch compatibility
+    class Identity(nn.Module):
+        r"""A placeholder identity operator that is argument-insensitive.
+        """
+        def __init__(self, *args, **kwargs):
+            super(Identity, self).__init__()
+
+        def forward(self, input):
+            return input
+
+
 if not six.PY2:
     def add_start_docstrings(*docstr):
         def docstring_decorator(fn):
@@ -731,7 +745,7 @@ class SequenceSummary(nn.Module):
             # We can probably just use the multi-head attention module of PyTorch >=1.1.0
             raise NotImplementedError
 
-        self.summary = nn.Identity()
+        self.summary = Identity()
         if hasattr(config, 'summary_use_proj') and config.summary_use_proj:
             if hasattr(config, 'summary_proj_to_labels') and config.summary_proj_to_labels and config.num_labels > 0:
                 num_classes = config.num_labels
@@ -739,15 +753,15 @@ class SequenceSummary(nn.Module):
                 num_classes = config.hidden_size
             self.summary = nn.Linear(config.hidden_size, num_classes)
 
-        self.activation = nn.Identity()
+        self.activation = Identity()
         if hasattr(config, 'summary_activation') and config.summary_activation == 'tanh':
             self.activation = nn.Tanh()
 
-        self.first_dropout = nn.Identity()
+        self.first_dropout = Identity()
         if hasattr(config, 'summary_first_dropout') and config.summary_first_dropout > 0:
             self.first_dropout = nn.Dropout(config.summary_first_dropout)
 
-        self.last_dropout = nn.Identity()
+        self.last_dropout = Identity()
         if hasattr(config, 'summary_last_dropout') and config.summary_last_dropout > 0:
             self.last_dropout = nn.Dropout(config.summary_last_dropout)
 

From a7fce6d9176cf3662d153af54270f345eb0bec8d Mon Sep 17 00:00:00 2001
From: Chi-Liang Liu <liangtaiwan1230@gmail.com>
Date: Wed, 24 Jul 2019 16:11:36 +0800
Subject: [PATCH 04/23] fix squad v1 error (na_prob_file should be None)

---
 examples/run_squad.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 36e03fb012..df8e3b4a82 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -241,7 +241,10 @@ def evaluate(args, model, tokenizer, prefix=""):
     # Compute predictions
     output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
     output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
-    output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
+    if args.version_2_with_negative:
+        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
+    else:
+        output_null_log_odds_file = None
 
     if args.model_type in ['xlnet', 'xlm']:
         # XLNet uses a more complex post-processing procedure

From 66b15f73f0caeadadf1c65c6e047ebb4285f1f7a Mon Sep 17 00:00:00 2001
From: rococo // Ron <rococo@tangleroad.com>
Date: Wed, 24 Jul 2019 11:27:08 -0700
Subject: [PATCH 05/23] Update docs for parameter rename

OpenAIGPTLMHeadModel now accepts `labels` instead of `lm_labels`
---
 pytorch_transformers/modeling_openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 4ea19a965d..17a46fa470 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -538,7 +538,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for language modeling.
-            Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
+            Note that the labels **are shifted** inside the model, i.e. you can set ``labels = input_ids``
             Indices are selected in ``[-1, 0, ..., config.vocab_size]``
             All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

From ae152cec09b496101841dcbc59613cc7a3d133a4 Mon Sep 17 00:00:00 2001
From: Joel Grus <joelgrus@gmail.com>
Date: Wed, 24 Jul 2019 16:54:48 -0700
Subject: [PATCH 06/23] make save_pretrained work with added tokens

right now it's dumping the *decoder* when it should be dumping the *encoder*. this fixes that.
---
 pytorch_transformers/tokenization_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index f603a29d74..858edc7c50 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -266,7 +266,7 @@ class PreTrainedTokenizer(object):
 
         with open(added_tokens_file, 'w', encoding='utf-8') as f:
             if self.added_tokens_encoder:
-                out_str = json.dumps(self.added_tokens_decoder, ensure_ascii=False)
+                out_str = json.dumps(self.added_tokens_encoder, ensure_ascii=False)
             else:
                 out_str = u"{}"
             f.write(out_str)

From adb3ef636877586ab64ea9be97f3407433d053d8 Mon Sep 17 00:00:00 2001
From: zijunsun <zijun_sun@shannonai.com>
Date: Thu, 25 Jul 2019 13:09:10 +0800
Subject: [PATCH 07/23] multi-gpu training also should be after apex fp16

---
 examples/run_glue.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 5d9abd06fc..0d4ffaa390 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -92,6 +92,10 @@ def train(args, train_dataset, model, tokenizer):
             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
         model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
 
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
@@ -418,8 +422,6 @@ def main():
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
     model.to(args.device)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
 
     logger.info("Training/evaluation parameters %s", args)
 

From 35c52f2f3cf85e26a85a7c52cff789983edaa62c Mon Sep 17 00:00:00 2001
From: Sukuya <sukuya@users.noreply.github.com>
Date: Thu, 25 Jul 2019 16:51:11 +0800
Subject: [PATCH 08/23] Update torchscript.rst

Import fixed to pytorch_transformers else torchscript flag can't be used.
---
 docs/source/torchscript.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/torchscript.rst b/docs/source/torchscript.rst
index 1b84559567..e207068fad 100644
--- a/docs/source/torchscript.rst
+++ b/docs/source/torchscript.rst
@@ -74,7 +74,7 @@ according to a ``BertConfig`` class and then saved to disk under the filename ``
 
 .. code-block:: python
 
-    from pytorch_pretrained_bert import BertModel, BertTokenizer, BertConfig
+    from pytorch_transformers import BertModel, BertTokenizer, BertConfig
     import torch
 
     enc = BertTokenizer.from_pretrained("bert-base-uncased")
@@ -129,4 +129,4 @@ Using the traced model for inference is as simple as using its ``__call__`` dund
 
 .. code-block:: python
 
-    traced_model(tokens_tensor, segments_tensors)
\ No newline at end of file
+    traced_model(tokens_tensor, segments_tensors)

From f0aeb7a814289a64a5b22577415a0cfcde3c7870 Mon Sep 17 00:00:00 2001
From: zijunsun <zijun_sun@shannonai.com>
Date: Fri, 26 Jul 2019 15:23:29 +0800
Subject: [PATCH 09/23] =?UTF-8?q?multi-gpu=20training=20also=20should=20be?=
 =?UTF-8?q?=20after=20apex=20fp16=EF=BC=88squad=EF=BC=89?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 examples/run_squad.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 36e03fb012..692cb4a20c 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -101,6 +101,10 @@ def train(args, train_dataset, model, tokenizer):
             raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
         model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
 
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
     # Distributed training (should be after apex fp16 initialization)
     if args.local_rank != -1:
         model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
@@ -457,8 +461,6 @@ def main():
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
     model.to(args.device)
-    if args.n_gpu > 1:
-        model = torch.nn.DataParallel(model)
 
     logger.info("Training/evaluation parameters %s", args)
 

From edfd965ac8a5446adb2c94ad043263b3144b3f95 Mon Sep 17 00:00:00 2001
From: David Pollack <david@i2x.ai>
Date: Fri, 26 Jul 2019 14:13:46 +0200
Subject: [PATCH 10/23] fix convert_to_tf

---
 .../convert_pytorch_checkpoint_to_tf.py                | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
index b8858ee3dc..a2e7b5c41a 100644
--- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
+++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
@@ -72,11 +72,11 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
         return 'bert/{}'.format(name)
 
     def assign_tf_var(tensor:np.ndarray, name:str):
-        tmp_var = tf.Variable(initial_value=tensor)
-        tf_var = tf.get_variable(dtype=tmp_var.dtype, shape=tmp_var.shape, name=name)
-        op = tf.assign(ref=tf_var, value=tmp_var)
-        session.run(tf.variables_initializer([tmp_var, tf_var]))
-        session.run(fetches=[op, tf_var])
+        tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
+        tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name)
+        session.run(tf.variables_initializer([tf_var]))
+        tf.keras.backend.set_value(tf_var, tensor)
+        session.run(tf_var)
         return tf_var
 
     for var_name in state_dict:

From 09ecf225e9ac00f78ecf9246957128f5d7d79a52 Mon Sep 17 00:00:00 2001
From: David Pollack <david@i2x.ai>
Date: Fri, 26 Jul 2019 15:20:44 +0200
Subject: [PATCH 11/23] fixed the fix.  tf session madness.

---
 .../convert_pytorch_checkpoint_to_tf.py       | 30 +++++++++----------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
index a2e7b5c41a..c24dddc4d6 100644
--- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
+++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
@@ -62,34 +62,34 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
     if not os.path.isdir(ckpt_dir):
         os.makedirs(ckpt_dir)
 
-    session = tf.Session()
     state_dict = model.state_dict()
-    tf_vars = []
 
     def to_tf_var_name(name:str):
         for patt, repl in iter(var_map):
             name = name.replace(patt, repl)
         return 'bert/{}'.format(name)
 
-    def assign_tf_var(tensor:np.ndarray, name:str):
+    def create_tf_var(tensor:np.ndarray, name:str, session:tf.Session):
         tf_dtype = tf.dtypes.as_dtype(tensor.dtype)
-        tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name)
+        tf_var = tf.get_variable(dtype=tf_dtype, shape=tensor.shape, name=name, initializer=tf.zeros_initializer())
         session.run(tf.variables_initializer([tf_var]))
-        tf.keras.backend.set_value(tf_var, tensor)
         session.run(tf_var)
         return tf_var
 
-    for var_name in state_dict:
-        tf_name = to_tf_var_name(var_name)
-        torch_tensor = state_dict[var_name].numpy()
-        if any([x in var_name for x in tensors_to_transopse]):
-            torch_tensor = torch_tensor.T
-        tf_tensor = assign_tf_var(tensor=torch_tensor, name=tf_name)
-        tf_vars.append(tf_tensor)
-        print("{0}{1}initialized".format(tf_name, " " * (60 - len(tf_name))))
+    tf.reset_default_graph()
+    with tf.Session() as session:
+        for var_name in state_dict:
+            tf_name = to_tf_var_name(var_name)
+            torch_tensor = state_dict[var_name].numpy()
+            if any([x in var_name for x in tensors_to_transopse]):
+                torch_tensor = torch_tensor.T
+            tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
+            tf.keras.backend.set_value(tf_var, torch_tensor)
+            tf_weight = session.run(tf_var)
+            print("Successfully created {}: {}".format(tf_name, np.allclose(tf_weight, torch_tensor)))
 
-    saver = tf.train.Saver(tf_vars)
-    saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
+        saver = tf.train.Saver(tf.trainable_variables())
+        saver.save(session, os.path.join(ckpt_dir, model_name.replace("-", "_") + ".ckpt"))
 
 
 def main(raw_args=None):

From 632d711411d2126e90cd4657f411a09bc180f561 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 26 Jul 2019 21:14:37 +0200
Subject: [PATCH 12/23] fix #908

---
 pytorch_transformers/__init__.py | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index f875e4ab18..b4b957192c 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -7,20 +7,20 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)
 
-from .modeling_bert import (BertConfig, BertModel, BertForPreTraining,
-                       BertForMaskedLM, BertForNextSentencePrediction,
-                       BertForSequenceClassification, BertForMultipleChoice,
-                       BertForTokenClassification, BertForQuestionAnswering,
-                       load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                       BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
-from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTModel,
+from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
+                            BertForMaskedLM, BertForNextSentencePrediction,
+                            BertForSequenceClassification, BertForMultipleChoice,
+                            BertForTokenClassification, BertForQuestionAnswering,
+                            load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+                            BERT_PRETRAINED_CONFIG_ARCHIVE_MAP)
+from .modeling_openai import (OpenAIGPTConfig, OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                               OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                               load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                               OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLModel, TransfoXLLMHeadModel,
+from .modeling_transfo_xl import (TransfoXLConfig, TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
                                   load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_gpt2 import (GPT2Config, GPT2Model,
+from .modeling_gpt2 import (GPT2Config, GPT2PreTrainedModel, GPT2Model,
                             GPT2LMHeadModel, GPT2DoubleHeadsModel,
                             load_tf_weights_in_gpt2, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
                             GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -29,7 +29,7 @@ from .modeling_xlnet import (XLNetConfig,
                              XLNetForSequenceClassification, XLNetForQuestionAnswering,
                              load_tf_weights_in_xlnet, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
                              XLNET_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_xlm import (XLMConfig, XLMModel,
+from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)

From 7b6e474c9acc26962363e78ef95fdb6f006eb0b4 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 26 Jul 2019 21:26:44 +0200
Subject: [PATCH 13/23] fix #901

---
 pytorch_transformers/tokenization_utils.py | 28 ++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 858edc7c50..e2fe46320e 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -160,26 +160,46 @@ class PreTrainedTokenizer(object):
         s3_models = list(cls.max_model_input_sizes.keys())
         vocab_files = {}
         if pretrained_model_name_or_path in s3_models:
+            # Get the vocabulary from AWS S3 bucket
             for file_id, map_list in cls.pretrained_vocab_files_map.items():
                 vocab_files[file_id] = map_list[pretrained_model_name_or_path]
         else:
+            # Get the vocabulary from local files
             logger.info(
                 "Model name '{}' not found in model shortcut name list ({}). "
                 "Assuming '{}' is a path or url to a directory containing tokenizer files.".format(
                     pretrained_model_name_or_path, ', '.join(s3_models),
                     pretrained_model_name_or_path))
-            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
-                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
-            all_vocab_files_names.update(cls.vocab_files_names)
-            for file_id, file_name in all_vocab_files_names.items():
+
+            # Look for the tokenizer main vocabulary files
+            for file_id, file_name in cls.vocab_files_names.items():
                 if os.path.isdir(pretrained_model_name_or_path):
+                    # If a directory is provided we look for the standard filenames
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
                 else:
+                    # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
                     full_file_name = pretrained_model_name_or_path
                 if not os.path.exists(full_file_name):
                     logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
                     full_file_name = None
                 vocab_files[file_id] = full_file_name
+
+            # Look for the additional tokens files
+            all_vocab_files_names = {'added_tokens_file': ADDED_TOKENS_FILE,
+                                     'special_tokens_map_file': SPECIAL_TOKENS_MAP_FILE}
+
+            # If a path to a file was provided, get the parent directory
+            saved_directory = pretrained_model_name_or_path
+            if os.path.exists(saved_directory) and not os.path.isdir(saved_directory):
+                saved_directory = os.path.dirname(saved_directory)
+
+            for file_id, file_name in all_vocab_files_names.items():
+                full_file_name = os.path.join(saved_directory, file_name)
+                if not os.path.exists(full_file_name):
+                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                    full_file_name = None
+                vocab_files[file_id] = full_file_name
+
             if all(full_file_name is None for full_file_name in vocab_files.values()):
                 logger.error(
                     "Model name '{}' was not found in model name list ({}). "

From c90119e5430954abc9e852dd334d90d3ca906eb1 Mon Sep 17 00:00:00 2001
From: David Pollack <david@i2x.ai>
Date: Mon, 29 Jul 2019 16:56:02 +0200
Subject: [PATCH 14/23] spelling mistake

---
 pytorch_transformers/convert_pytorch_checkpoint_to_tf.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
index c24dddc4d6..025c2f396c 100644
--- a/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
+++ b/pytorch_transformers/convert_pytorch_checkpoint_to_tf.py
@@ -41,7 +41,7 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
         N BertForQuestionAnswering
     """
 
-    tensors_to_transopse = (
+    tensors_to_transpose = (
         "dense.weight",
         "attention.self.query",
         "attention.self.key",
@@ -81,7 +81,7 @@ def convert_pytorch_checkpoint_to_tf(model:BertModel, ckpt_dir:str, model_name:s
         for var_name in state_dict:
             tf_name = to_tf_var_name(var_name)
             torch_tensor = state_dict[var_name].numpy()
-            if any([x in var_name for x in tensors_to_transopse]):
+            if any([x in var_name for x in tensors_to_transpose]):
                 torch_tensor = torch_tensor.T
             tf_var = create_tf_var(tensor=torch_tensor, name=tf_name, session=session)
             tf.keras.backend.set_value(tf_var, torch_tensor)

From 769bb643ce4e6d5836b41b41430ce02473907ec8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gr=C3=A9gory=20Ch=C3=A2tel?= <chatel.gregory@gmail.com>
Date: Wed, 31 Jul 2019 16:17:15 +0200
Subject: [PATCH 15/23] Fixing a broken link.

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8e2074f727..a4905e5854 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@ These implementations have been tested on several datasets (see the example scri
 | Section | Description |
 |-|-|
 | [Installation](#installation) | How to install the package |
-| [Quick tour: Usage](#quick-tour-usage) | Tokenizers & models usage: Bert and GPT-2 |
+| [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
 | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-pytorch-transformers) | Migrating your code from pytorch-pretrained-bert to pytorch-transformers |
 | [Documentation](https://huggingface.co/pytorch-transformers/) | Full API documentation and more |

From 97091acb8c5bd192a354375e58352694007b2390 Mon Sep 17 00:00:00 2001
From: Pierric Cistac <Pierrci@users.noreply.github.com>
Date: Wed, 31 Jul 2019 10:37:56 -0400
Subject: [PATCH 16/23] Small spelling fix

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a4905e5854..54c0ac94a1 100644
--- a/README.md
+++ b/README.md
@@ -283,7 +283,7 @@ Here is a quick summary of what you should take care of when migrating from `pyt
 
 The main breaking change when migrating from `pytorch-pretrained-bert` to `pytorch-transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
 
-The exact content of the tuples for each model are detailled in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
+The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/pytorch-transformers/).
 
 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
 

From f2a3eb987e1fc2c85320fc3849c67811f5736b50 Mon Sep 17 00:00:00 2001
From: Anthony MOI <m.anthony.moi@gmail.com>
Date: Wed, 31 Jul 2019 11:05:06 -0400
Subject: [PATCH 17/23] Fix small typos

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 54c0ac94a1..7365e02a09 100644
--- a/README.md
+++ b/README.md
@@ -194,7 +194,7 @@ python ./examples/run_glue.py \
     --warmup_steps=120
 ```
 
-On this machine we thus have a batch size of 32, please increase `gradient_accumulation_steps` to reach the same batch size if you have a smaller machine. These hyper-parameters should results in a Pearson correlation coefficient of `+0.917` on the development set.
+On this machine we thus have a batch size of 32, please increase `gradient_accumulation_steps` to reach the same batch size if you have a smaller machine. These hyper-parameters should result in a Pearson correlation coefficient of `+0.917` on the development set.
 
 #### Fine-tuning Bert model on the MRPC classification task
 
@@ -264,7 +264,7 @@ This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-s
 ### `run_generation.py`: Text generation with GPT, GPT-2, Transformer-XL and XLNet
 
 A conditional generation script is also included to generate text from a prompt.
-The generation script include the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
+The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
 
 Here is how to run the script with the small version of OpenAI GPT-2 model:
 

From 44dd941efb602433b7edc29612cbdd0a03bf14dc Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 31 Jul 2019 21:09:04 -0400
Subject: [PATCH 18/23] link to `swift-coreml-transformers`

---
 README.md                    | 10 ++++++++++
 docs/source/installation.rst | 13 +++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/README.md b/README.md
index 7365e02a09..703eb47df9 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,16 @@ python -m pytest -sv ./pytorch_transformers/tests/
 python -m pytest -sv ./examples/
 ```
 
+### Do you want to run a Transformer model on a mobile device?
+
+You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
+
+It contains an example of a conversion script from a PyTorch-trained Transformer model (here, `GPT-2`) to a CoreML model that runs on iOS devices.
+
+At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
+or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
+
+
 ## Quick tour
 
 Let's do a very quick overview of PyTorch-Transformers. Detailed examples for each model architecture (Bert, GPT, GPT-2, Transformer-XL, XLNet and XLM) can be found in the [full documentation](https://huggingface.co/pytorch-transformers/).
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index f8beb9f1c8..3a4663da0b 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -50,3 +50,16 @@ If you want to reproduce the original tokenization process of the ``OpenAI GPT``
    python -m spacy download en
 
 If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
+
+
+Do you want to run a Transformer model on a mobile device?
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+You should check out our `swift-coreml-transformers <https://github.com/huggingface/swift-coreml-transformers>`_ repo.
+
+It contains an example of a conversion script from a PyTorch-trained Transformer model (here, ``GPT-2``) to a CoreML model that runs on iOS devices.
+
+It also contains an implementation of BERT for Question answering.
+
+At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
+or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!

From 7c524d631e4c0fd0531d02d6a155fc95a3e90810 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 16:25:54 +0200
Subject: [PATCH 19/23] add issue templates

---
 .github/ISSUE_TEMPLATE/bug-report.md      | 36 +++++++++++++++++++
 .github/ISSUE_TEMPLATE/feature-request.md | 16 +++++++++
 .github/ISSUE_TEMPLATE/migration.md       | 43 +++++++++++++++++++++++
 .github/ISSUE_TEMPLATE/question-help.md   |  8 +++++
 4 files changed, 103 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/bug-report.md
 create mode 100644 .github/ISSUE_TEMPLATE/feature-request.md
 create mode 100644 .github/ISSUE_TEMPLATE/migration.md
 create mode 100644 .github/ISSUE_TEMPLATE/question-help.md

diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
new file mode 100644
index 0000000000..0d9439887b
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -0,0 +1,36 @@
+---
+name: "\U0001F41B Bug Report"
+about: Submit a bug report to help us improve PyTorch Transformers
+---
+
+## 🐛 Bug
+
+<!-- A clear and concise description of what the bug is. -->
+
+## To Reproduce
+
+Steps to reproduce the behavior:
+
+1.
+2.
+3.
+
+<!-- If you have a code sample, error messages, stack traces, please provide it here as well. -->
+
+## Expected behavior
+
+<!-- A clear and concise description of what you expected to happen. -->
+
+## Environment
+
+* OS:
+* Python version:
+* PyTorch version:
+* PyTorch Transformers version (or branch):
+* Using GPU ?
+* Distributed or parallel setup ?
+* Any other relevant information:
+
+## Additional context
+
+<!-- Add any other context about the problem here. -->
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md
new file mode 100644
index 0000000000..828e3737be
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature-request.md
@@ -0,0 +1,16 @@
+---
+name: "\U0001F680 Feature Request"
+about: Submit a proposal/request for a new PyTorch Transformers feature
+---
+
+## 🚀 Feature
+
+<!-- A clear and concise description of the feature proposal. Please provide a link to the paper and code in case they exist. -->
+
+## Motivation
+
+<!-- Please outline the motivation for the proposal. Is your feature request related to a problem? e.g., I'm always frustrated when [...]. If this is related to another GitHub issue, please link here too. -->
+
+## Additional context
+
+<!-- Add any other context or screenshots about the feature request here. -->
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/migration.md b/.github/ISSUE_TEMPLATE/migration.md
new file mode 100644
index 0000000000..9a8b19dffa
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -0,0 +1,43 @@
+---
+name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
+about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-Transformers
+---
+
+## 📚 Migration
+
+<!-- Give at least the following information -->
+
+Model I am using (Bert, XLNet....):
+
+The problem arise when using:
+* [ ] the official example scripts
+* [ ] my own modified scripts
+
+The tasks I am working on is:
+* [ ] an official GLUE/SQUaD task: (give the name)
+* [ ] my own task or dataset: (give details)
+
+Language I am using the model on (English, Chinese....):
+
+Details of the issue:
+
+<!-- A clear and concise description of the migration issue. If you have code snippets, please provide it here as well. -->
+
+## Environment
+
+* OS:
+* Python version:
+* PyTorch version:
+* PyTorch Transformers version (or branch):
+* Using GPU ?
+* Distributed or parallel setup ?
+* Any other relevant information:
+
+## Checklist
+
+- [ ] I have read the migration guide in the readme.
+- [ ] I checked if a related official extension example runs on my machine.
+
+## Additional context
+
+<!-- Add any other context about the problem here. -->
\ No newline at end of file
diff --git a/.github/ISSUE_TEMPLATE/question-help.md b/.github/ISSUE_TEMPLATE/question-help.md
new file mode 100644
index 0000000000..8c76994b02
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/question-help.md
@@ -0,0 +1,8 @@
+---
+name: "❓Questions & Help"
+about: Start a general discussion related to PyTorch Transformers
+---
+
+## ❓ Questions & Help
+
+<!-- A clear and concise description of the question. -->
\ No newline at end of file

From 077ad693e9c3b5702ba9874f7a0f0ed8099c9773 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 16:46:29 +0200
Subject: [PATCH 20/23] tweak issue templates wordings

---
 .github/ISSUE_TEMPLATE/bug-report.md | 14 +++++++++++++-
 .github/ISSUE_TEMPLATE/migration.md  | 10 +++++-----
 2 files changed, 18 insertions(+), 6 deletions(-)

diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
index 0d9439887b..66f7831aea 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -5,7 +5,19 @@ about: Submit a bug report to help us improve PyTorch Transformers
 
 ## 🐛 Bug
 
-<!-- A clear and concise description of what the bug is. -->
+<!-- Important information -->
+
+Model I am using (Bert, XLNet....):
+
+Language I am using the model on (English, Chinese....):
+
+The problem arises when using:
+* [ ] the official example scripts: (give details)
+* [ ] my own modified scripts: (give details)
+
+The task I am working on is:
+* [ ] an official GLUE/SQUaD task: (give the name)
+* [ ] my own task or dataset: (give details)
 
 ## To Reproduce
 
diff --git a/.github/ISSUE_TEMPLATE/migration.md b/.github/ISSUE_TEMPLATE/migration.md
index 9a8b19dffa..cf0c9a4757 100644
--- a/.github/ISSUE_TEMPLATE/migration.md
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -5,20 +5,20 @@ about: Report a problem when migrating from PyTorch-pretrained-Bert to PyTorch-T
 
 ## 📚 Migration
 
-<!-- Give at least the following information -->
+<!-- Important information -->
 
 Model I am using (Bert, XLNet....):
 
+Language I am using the model on (English, Chinese....):
+
 The problem arise when using:
-* [ ] the official example scripts
-* [ ] my own modified scripts
+* [ ] the official example scripts: (give details)
+* [ ] my own modified scripts: (give details)
 
 The tasks I am working on is:
 * [ ] an official GLUE/SQUaD task: (give the name)
 * [ ] my own task or dataset: (give details)
 
-Language I am using the model on (English, Chinese....):
-
 Details of the issue:
 
 <!-- A clear and concise description of the migration issue. If you have code snippets, please provide it here as well. -->

From 70c10caa06d9feda3f446d0a82655f56cd2afdab Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 17:09:37 +0200
Subject: [PATCH 21/23] add option mentioned in #940

---
 examples/run_glue.py  | 6 ++++++
 examples/run_squad.py | 6 ++++++
 2 files changed, 12 insertions(+)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 0d4ffaa390..a939ea373b 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -247,6 +247,9 @@ def evaluate(args, model, tokenizer, prefix=""):
 
 
 def load_and_cache_examples(args, task, tokenizer, evaluate=False):
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
     processor = processors[task]()
     output_mode = output_modes[task]
     # Load data features from cache or dataset file
@@ -273,6 +276,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
 
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
     # Convert to Tensors and build dataset
     all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
     all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 7d768d2c43..e62a1f1ff3 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -272,6 +272,9 @@ def evaluate(args, model, tokenizer, prefix=""):
 
 
 def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
     # Load data features from cache or dataset file
     input_file = args.predict_file if evaluate else args.train_file
     cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
@@ -296,6 +299,9 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)
 
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
     # Convert to Tensors and build dataset
     all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
     all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)

From 7223886dc944b5476ea6be1a9838738644a2e9a1 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 17:16:56 +0200
Subject: [PATCH 22/23] fix #944

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 703eb47df9..f3d2865ba8 100644
--- a/README.md
+++ b/README.md
@@ -385,6 +385,7 @@ for batch in train_data:
     loss.backward()
     scheduler.step()
     optimizer.step()
+    optimizer.zero_grad()
 ```
 
 ## Citation

From 3a126e73dd020be851d59cfcdc741fe3e8c6ad4f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 5 Aug 2019 17:26:29 +0200
Subject: [PATCH 23/23] fix #950

---
 .../convert_transfo_xl_checkpoint_to_pytorch.py          | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
index b6672aedf7..5733146444 100755
--- a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -24,11 +24,10 @@ from io import open
 import torch
 
 import pytorch_transformers.tokenization_transfo_xl as data_utils
-from pytorch_transformers.modeling_transfo_xl import (CONFIG_NAME,
-                                                         WEIGHTS_NAME,
-                                                         TransfoXLConfig,
-                                                         TransfoXLLMHeadModel,
-                                                         load_tf_weights_in_transfo_xl)
+
+from pytorch_transformers import CONFIG_NAME, WEIGHTS_NAME
+from pytorch_transformers.modeling_transfo_xl import (TransfoXLConfig, TransfoXLLMHeadModel,
+                                                      load_tf_weights_in_transfo_xl)
 from pytorch_transformers.tokenization_transfo_xl import (CORPUS_NAME, VOCAB_FILES_NAMES)
 
 if sys.version_info[0] == 2: