From c4e9615691a19128f446563718355aedf03cf01b Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin <wechi@microsoft.com>
Date: Wed, 17 Jul 2019 09:08:40 -0700
Subject: [PATCH 01/36] Fix a path so that test can run on Windows

---
 pytorch_transformers/tests/modeling_common_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py
index 5ea98d68e2..e974ae865d 100644
--- a/pytorch_transformers/tests/modeling_common_test.py
+++ b/pytorch_transformers/tests/modeling_common_test.py
@@ -21,6 +21,7 @@ import os
 import shutil
 import json
 import random
+import uuid
 
 import unittest
 import logging
@@ -527,7 +528,7 @@ class ConfigTester(object):
 
     def create_and_test_config_to_json_file(self):
         config_first = self.config_class(**self.inputs_dict)
-        json_file_path = "/tmp/config.json"
+        json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
         config_first.to_json_file(json_file_path)
         config_second = self.config_class.from_json_file(json_file_path)
         os.remove(json_file_path)

From ba4bce2581f9a67caa44c3cc959a2dacb0090670 Mon Sep 17 00:00:00 2001
From: tuvuumass <tuvu@cs.umass.edu>
Date: Tue, 13 Aug 2019 11:26:27 -0400
Subject: [PATCH 02/36] fix issue #824

---
 examples/run_bertology.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/examples/run_bertology.py b/examples/run_bertology.py
index 61c7440ecb..f11b73b54f 100644
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -211,10 +211,12 @@ def prune_heads(args, model, eval_dataloader, head_mask):
 
 def main():
     parser = argparse.ArgumentParser()
+    ## Required parameters
     parser.add_argument("--data_dir", default=None, type=str, required=True,
                         help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_name", default=None, type=str, required=True,
-                        help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
+                            ALL_MODELS))
     parser.add_argument("--task_name", default=None, type=str, required=True,
                         help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
     parser.add_argument("--output_dir", default=None, type=str, required=True,
@@ -222,9 +224,9 @@ def main():
 
     ## Other parameters
     parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
+                        help="Pretrained config name or path if not the same as model_name_or_path")
     parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
+                        help="Pretrained tokenizer name or path if not the same as model_name_or_path")
     parser.add_argument("--cache_dir", default="", type=str,
                         help="Where do you want to store the pre-trained models downloaded from s3")
     parser.add_argument("--data_subset", type=int, default=-1,
@@ -297,15 +299,15 @@ def main():
 
     args.model_type = ""
     for key in MODEL_CLASSES:
-        if key in args.model_name.lower():
+        if key in args.model_name_or_path.lower():
             args.model_type = key  # take the first match in model types
             break
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name,
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
                                           num_labels=num_labels, finetuning_task=args.task_name,
                                           output_attentions=True)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name)
-    model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

From 9ce36e3e4b0b17dd6df05e13e563570677cda39e Mon Sep 17 00:00:00 2001
From: samvelyan <mika.samvelyan@gmail.com>
Date: Wed, 14 Aug 2019 08:57:09 +0000
Subject: [PATCH 03/36] Re-implemented tokenize() iteratively in
 PreTrainedTokenizer.

---
 pytorch_transformers/tokenization_utils.py | 42 ++++++++++++++++++----
 1 file changed, 36 insertions(+), 6 deletions(-)

diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 2e75c83bfb..bdeeeb4877 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -428,7 +428,7 @@ class PreTrainedTokenizer(object):
 
             Parameters:
                 special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``].
-                
+
                     Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
 
             Returns:
@@ -472,15 +472,45 @@ class PreTrainedTokenizer(object):
 
             Take care of added tokens.
         """
+        def split_on_token(tok, text):
+            """Split ``text`` on ``tok``, keeping every occurrence of ``tok`` as its own item.
+
+            Pieces are stripped and empty pieces are dropped. ``tok`` is emitted only between
+            pieces, so text without ``tok`` (e.g. whitespace-only text) never yields ``tok``.
+            """
+            result = []
+            split_text = text.split(tok)
+            last_index = len(split_text) - 1
+            for i, sub_text in enumerate(split_text):
+                sub_text = sub_text.strip()
+                if sub_text:
+                    result += [sub_text]
+                # ``tok`` sat between this piece and the next one, so re-insert it here.
+                if i != last_index:
+                    result += [tok]
+            return result
+
         def split_on_tokens(tok_list, text):
             if not text:
                 return []
             if not tok_list:
                 return self._tokenize(text, **kwargs)
-            tok = tok_list[0]
-            split_text = text.split(tok)
-            return sum((split_on_tokens(tok_list[1:], sub_text.strip()) + [tok] \
-                        for sub_text in split_text), [])[:-1]
+
+            tokenized_text = []
+            text_list = [text]
+            for tok in tok_list:
+                tokenized_text = []
+                for sub_text in text_list:
+                    if sub_text not in self.added_tokens_encoder \
+                            and sub_text not in self.all_special_tokens:
+                        tokenized_text += split_on_token(tok, sub_text)
+                    else:
+                        tokenized_text += [sub_text]
+                text_list = tokenized_text
+
+            return sum((self._tokenize(token, **kwargs) if token not \
+                    in self.added_tokens_encoder and token not in self.all_special_tokens \
+                    else [token] for token in tokenized_text), [])
 
         added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
         tokenized_text = split_on_tokens(added_tokens, text)
@@ -522,7 +552,7 @@ class PreTrainedTokenizer(object):
 
     def encode(self, text):
         """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
-        
+
         Same doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
         """
         return self.convert_tokens_to_ids(self.tokenize(text))

From b8ff56896ccbd27a54035a90a3bc278a44541a74 Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Fri, 16 Aug 2019 12:11:05 +0800
Subject: [PATCH 04/36] Fix bug of multi-gpu training in lm finetuning

---
 examples/lm_finetuning/finetune_on_pregenerated.py | 2 +-
 examples/lm_finetuning/simple_lm_finetuning.py     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 9fcc5f2cb1..7c40342f18 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -320,7 +320,7 @@ def main():
                     global_step += 1
 
     # Save a trained model
-    if  n_gpu > 1 and torch.distributed.get_rank() == 0  or n_gpu <=1 :
+    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
         logging.info("** ** * Saving fine-tuned model ** ** * ")
         model.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index ba5f832827..25333de0ed 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -507,7 +507,7 @@ def main():
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
         raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    if not os.path.exists(args.output_dir) and ( n_gpu > 1 and torch.distributed.get_rank() == 0  or n_gpu <=1 ):
+    if not os.path.exists(args.output_dir) and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         os.makedirs(args.output_dir)
 
     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
@@ -608,7 +608,7 @@ def main():
                     global_step += 1
 
         # Save a trained model
-        if args.do_train and ( n_gpu > 1 and torch.distributed.get_rank() == 0  or n_gpu <=1):
+        if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
             logger.info("** ** * Saving fine - tuned model ** ** * ")
             model.save_pretrained(args.output_dir)
             tokenizer.save_pretrained(args.output_dir)

From 7e7fc53da5f230db379ece739457c81b2f50f13e Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 16 Aug 2019 11:02:10 -0400
Subject: [PATCH 05/36] Fixing run_glue example with RoBERTa

---
 examples/run_glue.py   | 2 +-
 examples/utils_glue.py | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index c0f70e0863..7fb0732e61 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -279,7 +279,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             sep_token=tokenizer.sep_token,
             sep_token_extra=bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
             pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-            pad_token=tokenizer.encoder[tokenizer.pad_token] if args.model_type in ['roberta'] else tokenizer.vocab[tokenizer.pad_token],
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
             pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
         )
         if args.local_rank in [-1, 0]:
diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index c955e4d0ce..e1649fa5af 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -425,9 +425,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
             # Account for [CLS], [SEP], [SEP] with "- 3"
             _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
         else:
-            # Account for [CLS] and [SEP] with "- 2"
-            if len(tokens_a) > max_seq_length - 2:
-                tokens_a = tokens_a[:(max_seq_length - 2)]
+            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
+            special_tokens_count = 3 if sep_token_extra else 2
+            if len(tokens_a) > max_seq_length - special_tokens_count:
+                tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
 
         # The convention in BERT is:
         # (a) For sequence pairs:

From d8923270e6c497862f990a3c72e40cc1ddd01d4e Mon Sep 17 00:00:00 2001
From: Jason Phang <email@jasonphang.com>
Date: Fri, 16 Aug 2019 15:58:19 -0400
Subject: [PATCH 06/36] Correct truncation for RoBERTa in 2-input GLUE

---
 examples/utils_glue.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index e1649fa5af..3e3f104672 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -422,8 +422,9 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
             tokens_b = tokenizer.tokenize(example.text_b)
             # Modifies `tokens_a` and `tokens_b` in place so that the total
             # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3"
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+            # Account for [CLS], [SEP], [SEP] with "- 3". " -4" for RoBERTa.
+            special_tokens_count = 4 if sep_token_extra else 3
+            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
         else:
             # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
             special_tokens_count = 3 if sep_token_extra else 2

From 189ff9b66408a1758f3732725db3871322f3e0e6 Mon Sep 17 00:00:00 2001
From: Christophe Bourguignat <christophe.bourguignat@zelros.com>
Date: Sat, 17 Aug 2019 18:46:50 +0200
Subject: [PATCH 07/36] Update README after RoBERTa addition

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3389e10593..7d2445fc11 100644
--- a/README.md
+++ b/README.md
@@ -76,7 +76,7 @@ import torch
 from pytorch_transformers import *
 
 # PyTorch-Transformers has a unified API
-# for 6 transformer architectures and 27 pretrained weights.
+# for 7 transformer architectures and 30 pretrained weights.
 #          Model          | Tokenizer          | Pretrained weights shortcut
 MODELS = [(BertModel,       BertTokenizer,      'bert-base-uncased'),
           (OpenAIGPTModel,  OpenAIGPTTokenizer, 'openai-gpt'),

From 00e9c4cc9616cab1666cab0a331b5d7e68946928 Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Sun, 18 Aug 2019 11:02:02 +0800
Subject: [PATCH 08/36] Fix: save model/model.module

---
 examples/lm_finetuning/finetune_on_pregenerated.py | 11 ++++++-----
 examples/lm_finetuning/simple_lm_finetuning.py     |  3 ++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 7c40342f18..1177d84cd4 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -155,12 +155,12 @@ def main():
                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                         "0 (default value): dynamic loss scaling.\n"
                         "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument("--warmup_steps", 
-                        default=0, 
+    parser.add_argument("--warmup_steps",
+                        default=0,
                         type=int,
                         help="Linear warmup over warmup_steps.")
-    parser.add_argument("--adam_epsilon", 
-                        default=1e-8, 
+    parser.add_argument("--adam_epsilon",
+                        default=1e-8,
                         type=float,
                         help="Epsilon for Adam optimizer.")
     parser.add_argument("--learning_rate",
@@ -322,7 +322,8 @@ def main():
     # Save a trained model
     if args.local_rank == -1 or torch.distributed.get_rank() == 0:
         logging.info("** ** * Saving fine-tuned model ** ** * ")
-        model.save_pretrained(args.output_dir)
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
 
diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index 25333de0ed..9633640faf 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -610,7 +610,8 @@ def main():
         # Save a trained model
         if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
             logger.info("** ** * Saving fine - tuned model ** ** * ")
-            model.save_pretrained(args.output_dir)
+            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+            model_to_save.save_pretrained(args.output_dir)
             tokenizer.save_pretrained(args.output_dir)
 
 

From 1ef41b83374ce5756e24746201d21432d7ecada0 Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Sun, 18 Aug 2019 11:03:12 +0800
Subject: [PATCH 09/36] Revert "Fix: save model/model.module"

This reverts commit 00e9c4cc9616cab1666cab0a331b5d7e68946928.
---
 examples/lm_finetuning/finetune_on_pregenerated.py | 11 +++++------
 examples/lm_finetuning/simple_lm_finetuning.py     |  3 +--
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 1177d84cd4..7c40342f18 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -155,12 +155,12 @@ def main():
                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                         "0 (default value): dynamic loss scaling.\n"
                         "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument("--warmup_steps",
-                        default=0,
+    parser.add_argument("--warmup_steps", 
+                        default=0, 
                         type=int,
                         help="Linear warmup over warmup_steps.")
-    parser.add_argument("--adam_epsilon",
-                        default=1e-8,
+    parser.add_argument("--adam_epsilon", 
+                        default=1e-8, 
                         type=float,
                         help="Epsilon for Adam optimizer.")
     parser.add_argument("--learning_rate",
@@ -322,8 +322,7 @@ def main():
     # Save a trained model
     if args.local_rank == -1 or torch.distributed.get_rank() == 0:
         logging.info("** ** * Saving fine-tuned model ** ** * ")
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
+        model.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
 
diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index 9633640faf..25333de0ed 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -610,8 +610,7 @@ def main():
         # Save a trained model
         if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
             logger.info("** ** * Saving fine - tuned model ** ** * ")
-            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
-            model_to_save.save_pretrained(args.output_dir)
+            model.save_pretrained(args.output_dir)
             tokenizer.save_pretrained(args.output_dir)
 
 

From 856a63da4d1f0f302633dc73e2d4a1f698bbafda Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Sun, 18 Aug 2019 11:03:47 +0800
Subject: [PATCH 10/36] Fix: save model/model.module

---
 examples/lm_finetuning/finetune_on_pregenerated.py | 3 ++-
 examples/lm_finetuning/simple_lm_finetuning.py     | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 7c40342f18..eefa56c824 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -322,7 +322,8 @@ def main():
     # Save a trained model
     if args.local_rank == -1 or torch.distributed.get_rank() == 0:
         logging.info("** ** * Saving fine-tuned model ** ** * ")
-        model.save_pretrained(args.output_dir)
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
 
diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index 25333de0ed..9633640faf 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -610,7 +610,8 @@ def main():
         # Save a trained model
         if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
             logger.info("** ** * Saving fine - tuned model ** ** * ")
-            model.save_pretrained(args.output_dir)
+            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+            model_to_save.save_pretrained(args.output_dir)
             tokenizer.save_pretrained(args.output_dir)
 
 

From 40acf6b52a5250608c2b90edd955835131971d5a Mon Sep 17 00:00:00 2001
From: Chi-Liang Liu <liangtaiwan1230@gmail.com>
Date: Tue, 30 Jul 2019 18:37:37 +0800
Subject: [PATCH 11/36] don't save model without training

---
 examples/run_squad.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index f0ae9169ad..f2d29fd6b1 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -481,7 +481,7 @@ def main():
 
 
     # Save the trained model and the tokenizer
-    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
         if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
             os.makedirs(args.output_dir)

From c589862b783b94a8408b40c6dc9bf4a14b2ee391 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Mon, 19 Aug 2019 10:17:47 -0400
Subject: [PATCH 12/36] Doc: loading from config alone does not load the model
 weights

---
 pytorch_transformers/modeling_bert.py       | 4 +++-
 pytorch_transformers/modeling_gpt2.py       | 2 ++
 pytorch_transformers/modeling_openai.py     | 2 ++
 pytorch_transformers/modeling_roberta.py    | 3 ++-
 pytorch_transformers/modeling_transfo_xl.py | 2 ++
 pytorch_transformers/modeling_utils.py      | 4 ++++
 pytorch_transformers/modeling_xlm.py        | 2 ++
 pytorch_transformers/modeling_xlnet.py      | 2 ++
 8 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 51d8788545..9c20eac9bf 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -577,7 +577,9 @@ BERT_START_DOCSTRING = r"""    The BERT model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 BERT_INPUTS_DOCSTRING = r"""
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 5211def3e3..f67d0e88d5 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -383,6 +383,8 @@ GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
 
     Parameters:
         config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 GPT2_INPUTS_DOCSTRING = r"""    Inputs:
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 364923b0af..e8648487be 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -397,6 +397,8 @@ OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
 
     Parameters:
         config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py
index adb04b4b3a..e3065cf60b 100644
--- a/pytorch_transformers/modeling_roberta.py
+++ b/pytorch_transformers/modeling_roberta.py
@@ -90,7 +90,8 @@ ROBERTA_START_DOCSTRING = r"""    The RoBERTa model was proposed in
 
     Parameters:
         config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the 
-            model.
+            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 ROBERTA_INPUTS_DOCSTRING = r"""
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index cb5416964c..553a71fffe 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -928,6 +928,8 @@ TRANSFO_XL_START_DOCSTRING = r"""    The Transformer-XL model was proposed in
 
     Parameters:
         config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 TRANSFO_XL_INPUTS_DOCSTRING = r"""
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 35f82e324f..edc6b3903e 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -71,6 +71,10 @@ class PretrainedConfig(object):
     r""" Base class for all configuration classes.
         Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
 
+        Note:
+            A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
+            It only affects the model's configuration.
+
         Class attributes (overridden by derived classes):
             - ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
 
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 941c8dda2f..d01d245bbb 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -416,6 +416,8 @@ XLM_START_DOCSTRING = r"""    The XLM model was proposed in
 
     Parameters:
         config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 XLM_INPUTS_DOCSTRING = r"""
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index e9e75e3ab7..af33c5a6c2 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -647,6 +647,8 @@ XLNET_START_DOCSTRING = r"""    The XLNet model was proposed in
 
     Parameters:
         config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 XLNET_INPUTS_DOCSTRING = r"""

From a368b877911862da014ed7b219679effbb8dd8ca Mon Sep 17 00:00:00 2001
From: Peng Qi <qipeng@users.noreply.github.com>
Date: Mon, 19 Aug 2019 13:07:00 -0700
Subject: [PATCH 13/36] Fix #1015

---
 examples/run_squad.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index f2d29fd6b1..efa835107c 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -498,7 +498,7 @@ def main():
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         model.to(args.device)
 
 

From 28f7ca1f807f0857c24f18c0b28b6b8ebee18c0a Mon Sep 17 00:00:00 2001
From: Zeyao Du <ned1991@gmail.com>
Date: Tue, 20 Aug 2019 15:58:42 +0800
Subject: [PATCH 14/36] swap optimizer.step and scheduler.step

---
 examples/lm_finetuning/simple_lm_finetuning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index ba5f832827..dca883d2f6 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -602,8 +602,8 @@ def main():
                 nb_tr_examples += input_ids.size(0)
                 nb_tr_steps += 1
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    scheduler.step()  # Update learning rate schedule
                     optimizer.step()
+                    scheduler.step()  # Update learning rate schedule
                     optimizer.zero_grad()
                     global_step += 1
 

From a1359b970cb4bfa41008a45b44dd2a25e579bff3 Mon Sep 17 00:00:00 2001
From: Zeyao Du <ned1991@gmail.com>
Date: Tue, 20 Aug 2019 16:00:07 +0800
Subject: [PATCH 15/36] Update finetune_on_pregenerated.py

---
 examples/lm_finetuning/finetune_on_pregenerated.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 9fcc5f2cb1..ccf1c15313 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -314,8 +314,8 @@ def main():
                 mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                 pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    scheduler.step()  # Update learning rate schedule
                     optimizer.step()
+                    scheduler.step()  # Update learning rate schedule
                     optimizer.zero_grad()
                     global_step += 1
 

From 45ab8bf60e5c2af912006035f5568be92c0c99c9 Mon Sep 17 00:00:00 2001
From: Duzeyao <330501241@qq.com>
Date: Tue, 20 Aug 2019 16:40:39 +0800
Subject: [PATCH 16/36] Revert "Update finetune_on_pregenerated.py"

This reverts commit a1359b970cb4bfa41008a45b44dd2a25e579bff3.
---
 examples/lm_finetuning/finetune_on_pregenerated.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index ccf1c15313..9fcc5f2cb1 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -314,8 +314,8 @@ def main():
                 mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                 pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    optimizer.step()
                     scheduler.step()  # Update learning rate schedule
+                    optimizer.step()
                     optimizer.zero_grad()
                     global_step += 1
 

From d86b49ac86141810af4a7c82ed34e789b3b1937e Mon Sep 17 00:00:00 2001
From: Duzeyao <330501241@qq.com>
Date: Tue, 20 Aug 2019 16:46:34 +0800
Subject: [PATCH 17/36] swap optimizer.step and scheduler.step

---
 examples/lm_finetuning/finetune_on_pregenerated.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 9fcc5f2cb1..ccf1c15313 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -314,8 +314,8 @@ def main():
                 mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                 pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    scheduler.step()  # Update learning rate schedule
                     optimizer.step()
+                    scheduler.step()  # Update learning rate schedule
                     optimizer.zero_grad()
                     global_step += 1
 

From fecaed0ed4bf338bca5b9895107b309841f8ac57 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 20 Aug 2019 10:56:12 +0200
Subject: [PATCH 18/36] add force_download option to from_pretrained methods

---
 pytorch_transformers/file_utils.py         | 13 ++++++++-----
 pytorch_transformers/modeling_utils.py     | 13 +++++++++++--
 pytorch_transformers/tokenization_utils.py |  6 +++++-
 3 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/pytorch_transformers/file_utils.py b/pytorch_transformers/file_utils.py
index 75c075720c..074e6743ef 100644
--- a/pytorch_transformers/file_utils.py
+++ b/pytorch_transformers/file_utils.py
@@ -93,12 +93,15 @@ def filename_to_url(filename, cache_dir=None):
     return url, etag
 
 
-def cached_path(url_or_filename, cache_dir=None):
+def cached_path(url_or_filename, cache_dir=None, force_download=False):
     """
     Given something that might be a URL (or might be a local path),
     determine which. If it's a URL, download the file and cache it, and
     return the path to the cached file. If it's already a local path,
     make sure the file exists and then return the path.
+    Args:
+        cache_dir: specify a cache directory to save the file to (overwrite the default cache dir).
+        force_download: if True, re-download the file even if it's already cached in the cache dir.
     """
     if cache_dir is None:
         cache_dir = PYTORCH_TRANSFORMERS_CACHE
@@ -111,7 +114,7 @@ def cached_path(url_or_filename, cache_dir=None):
 
     if parsed.scheme in ('http', 'https', 's3'):
         # URL, so get it from the cache (downloading if necessary)
-        return get_from_cache(url_or_filename, cache_dir)
+        return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download)
     elif os.path.exists(url_or_filename):
         # File, and it exists.
         return url_or_filename
@@ -184,7 +187,7 @@ def http_get(url, temp_file):
     progress.close()
 
 
-def get_from_cache(url, cache_dir=None):
+def get_from_cache(url, cache_dir=None, force_download=False):
     """
     Given a URL, look for the corresponding dataset in the local cache.
     If it's not there, download it. Then return the path to the cached file.
@@ -227,11 +230,11 @@ def get_from_cache(url, cache_dir=None):
         if matching_files:
             cache_path = os.path.join(cache_dir, matching_files[-1])
 
-    if not os.path.exists(cache_path):
+    if not os.path.exists(cache_path) or force_download:
         # Download to temporary file, then copy to cache dir once finished.
         # Otherwise you get corrupt cache entries if the download gets interrupted.
         with tempfile.NamedTemporaryFile() as temp_file:
-            logger.info("%s not found in cache, downloading to %s", url, temp_file.name)
+            logger.info("%s not found in cache or force_download set to True, downloading to %s", url, temp_file.name)
 
             # GET file object
             if url.startswith("s3://"):
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index edc6b3903e..3e4fbca132 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -125,6 +125,9 @@ class PretrainedConfig(object):
                 - The values in kwargs of any keys which are configuration attributes will be used to override the loaded values.
                 - Behavior concerning key/value pairs whose keys are *not* configuration attributes is controlled by the `return_unused_kwargs` keyword parameter.
 
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+
             return_unused_kwargs: (`optional`) bool:
 
                 - If False, then this function returns just the final configuration object.
@@ -146,6 +149,7 @@ class PretrainedConfig(object):
 
         """
         cache_dir = kwargs.pop('cache_dir', None)
+        force_download = kwargs.pop('force_download', False)
         return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
 
         if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
@@ -156,7 +160,7 @@ class PretrainedConfig(object):
             config_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir)
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download)
         except EnvironmentError:
             if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
                 logger.error(
@@ -400,6 +404,9 @@ class PreTrainedModel(nn.Module):
                 Path to a directory in which a downloaded pre-trained model
                 configuration should be cached if the standard cache should not be used.
 
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+
             output_loading_info: (`optional`) boolean:
                 Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
 
@@ -424,6 +431,7 @@ class PreTrainedModel(nn.Module):
         state_dict = kwargs.pop('state_dict', None)
         cache_dir = kwargs.pop('cache_dir', None)
         from_tf = kwargs.pop('from_tf', False)
+        force_download = kwargs.pop('force_download', False)
         output_loading_info = kwargs.pop('output_loading_info', False)
 
         # Load config
@@ -431,6 +439,7 @@ class PreTrainedModel(nn.Module):
             config, model_kwargs = cls.config_class.from_pretrained(
                 pretrained_model_name_or_path, *model_args,
                 cache_dir=cache_dir, return_unused_kwargs=True,
+                force_download=force_download,
                 **kwargs
             )
         else:
@@ -453,7 +462,7 @@ class PreTrainedModel(nn.Module):
                 archive_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir)
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download)
         except EnvironmentError:
             if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
                 logger.error(
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 74d50b385d..763c0cee04 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -193,6 +193,9 @@ class PreTrainedTokenizer(object):
             cache_dir: (`optional`) string:
                 Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the standard cache should not be used.
 
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the vocabulary files and override the cached versions if they exist.
+
             inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
 
             kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
@@ -223,6 +226,7 @@ class PreTrainedTokenizer(object):
     @classmethod
     def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         cache_dir = kwargs.pop('cache_dir', None)
+        force_download = kwargs.pop('force_download', False)
 
         s3_models = list(cls.max_model_input_sizes.keys())
         vocab_files = {}
@@ -283,7 +287,7 @@ class PreTrainedTokenizer(object):
                 if file_path is None:
                     resolved_vocab_files[file_id] = None
                 else:
-                    resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir)
+                    resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download)
         except EnvironmentError:
             if pretrained_model_name_or_path in s3_models:
                 logger.error("Couldn't reach server to download vocabulary.")

From e239a4a20fbb901e60ffcafc06bfefcbb67eaa65 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 20 Aug 2019 11:02:00 +0200
Subject: [PATCH 19/36] close #984

---
 docs/source/pretrained_models.rst | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 987882d12e..6a14e3dcd1 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -72,16 +72,16 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``xlnet-large-cased``                                      | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
 |                   |                                                            | | XLNet Large English model                                                                                                           |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| XLM               | ``xlm-mlm-en-2048``                                        | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+| XLM               | ``xlm-mlm-en-2048``                                        | | 12-layer, 2048-hidden, 16-heads                                                                                                      |
 |                   |                                                            | | XLM English model                                                                                                                   |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-ende-1024``                                      | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   | ``xlm-mlm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                      |
 |                   |                                                            | | XLM English-German Multi-language model                                                                                             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-enfr-1024``                                      | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   | ``xlm-mlm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                      |
 |                   |                                                            | | XLM English-French Multi-language model                                                                                             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-enro-1024``                                      | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   | ``xlm-mlm-enro-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                      |
 |                   |                                                            | | XLM English-Romanian Multi-language model                                                                                           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-mlm-xnli15-1024``                                    | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
@@ -93,7 +93,7 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``xlm-clm-enfr-1024``                                      | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
 |                   |                                                            | | XLM English model trained with CLM (Causal Language Modeling)                                                                       |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-clm-ende-1024``                                      | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   | ``xlm-clm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                      |
 |                   |                                                            | | XLM English-German Multi-language model trained with CLM (Causal Language Modeling)                                                 |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | RoBERTa           | ``roberta-base``                                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |

From 901dde0e4583a00dc7e486aca6cda7acb647dea9 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 20 Aug 2019 11:05:51 +0200
Subject: [PATCH 20/36] fix #1014

---
 pytorch_transformers/tokenization_bert.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 177d26dec1..04f35aa466 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -187,6 +187,8 @@ class BertTokenizer(PreTrainedTokenizer):
         index = 0
         if os.path.isdir(vocab_path):
             vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+        else:
+            vocab_file = vocab_path
         with open(vocab_file, "w", encoding="utf-8") as writer:
             for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
                 if index != token_index:

From 53c8f700f4704a58f4684674ced1c57d6ca9240c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 20 Aug 2019 11:29:26 +0200
Subject: [PATCH 21/36] fix #808

---
 pytorch_transformers/modeling_bert.py       | 5 ++++-
 pytorch_transformers/modeling_gpt2.py       | 2 ++
 pytorch_transformers/modeling_openai.py     | 2 ++
 pytorch_transformers/modeling_roberta.py    | 4 ++++
 pytorch_transformers/modeling_transfo_xl.py | 2 ++
 pytorch_transformers/modeling_xlm.py        | 4 ++++
 pytorch_transformers/modeling_xlnet.py      | 2 ++
 7 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 9c20eac9bf..7b34b3fd90 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -599,7 +599,10 @@ BERT_INPUTS_DOCSTRING = r"""
                 ``tokens:         [CLS] the dog is hairy . [SEP]``
                 
                 ``token_type_ids:   0   0   0   0  0     0   0``
-    
+
+            Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
             Indices can be obtained using :class:`pytorch_transformers.BertTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index f67d0e88d5..91d01d0584 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -390,6 +390,8 @@ GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
 GPT2_INPUTS_DOCSTRING = r"""    Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
+            GPT-2 is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index e8648487be..71ffb78e0f 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -404,6 +404,8 @@ OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
 OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
+            GPT is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
             Indices can be obtained using :class:`pytorch_transformers.BPT2Tokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py
index e3065cf60b..e49b2a06b1 100644
--- a/pytorch_transformers/modeling_roberta.py
+++ b/pytorch_transformers/modeling_roberta.py
@@ -110,6 +110,10 @@ ROBERTA_INPUTS_DOCSTRING = r"""
 
             Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with 
             the ``add_special_tokens`` parameter set to ``True``.
+
+            RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index 553a71fffe..3cfdee38cb 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -936,6 +936,8 @@ TRANSFO_XL_INPUTS_DOCSTRING = r"""
     Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
+            Transformer-XL is a model with relative position embeddings so you can either pad the inputs on
+            the right or on the left.
             Indices can be obtained using :class:`pytorch_transformers.TransfoXLTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index d01d245bbb..be2767ed0c 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -424,6 +424,10 @@ XLM_INPUTS_DOCSTRING = r"""
     Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
+
+            XLM is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
             Indices can be obtained using :class:`pytorch_transformers.XLMTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index af33c5a6c2..d44821788e 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -655,6 +655,8 @@ XLNET_INPUTS_DOCSTRING = r"""
     Inputs:
         **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
+            XLNet is a model with relative position embeddings so you can either pad the inputs on
+            the right or on the left.
             Indices can be obtained using :class:`pytorch_transformers.XLNetTokenizer`.
             See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
             :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.

From b0b9b8091b73f929306704bd8cd62b712621cebc Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 20 Aug 2019 11:33:46 +0200
Subject: [PATCH 22/36] minor typo

---
 pytorch_transformers/modeling_gpt2.py   | 2 +-
 pytorch_transformers/modeling_openai.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index f67d0e88d5..dd3e465bf3 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -614,7 +614,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 @add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
 head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
 The language modeling head has its weights tied to the input embeddings,
-the classification head takes as input the input of a specified classification token index in the intput sequence).
+the classification head takes as input the input of a specified classification token index in the input sequence).
 """, GPT2_START_DOCSTRING)
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     r"""    Inputs:
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index e8648487be..a4f02111e7 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -604,7 +604,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 @add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
 head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
 The language modeling head has its weights tied to the input embeddings,
-the classification head takes as input the input of a specified classification token index in the intput sequence).
+the classification head takes as input the input of a specified classification token index in the input sequence).
 """, OPENAI_GPT_START_DOCSTRING)
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     r"""    Inputs:

From 6d0aa73981f15618cf8d01255b07194e946c3286 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 20 Aug 2019 12:20:21 +0200
Subject: [PATCH 23/36] fix #1034

---
 pytorch_transformers/modeling_xlm.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index be2767ed0c..19800da2ed 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -440,8 +440,10 @@ XLM_INPUTS_DOCSTRING = r"""
             Indices are selected in the vocabulary (unlike BERT which has a specific vocabulary for segment indices).
         **langs**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             A parallel sequence of tokens to be used to indicate the language of each token in the input.
-            Indices are selected in the pre-trained language vocabulary,
-            i.e. in the range ``[0, config.n_langs - 1[``.
+            Indices are languages ids which can be obtained from the language names by using two conversion mappings
+            provided in the configuration of the model (only provided for multilingual models).
+            More precisely, the `language name -> language id` mapping is in `model.config.lang2id` (dict str -> int) and
+            the `language id -> language name` mapping is `model.config.id2lang` (dict int -> str).
         **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:

From bfd75056b0a080addafb7f3d7c9336d27b883a0e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?=
 <37592763+GuillemGSubies@users.noreply.github.com>
Date: Tue, 20 Aug 2019 14:06:17 +0200
Subject: [PATCH 24/36] Update tokenization_xlm.py

---
 pytorch_transformers/tokenization_xlm.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index b690a3a945..8e7c2954f2 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -124,8 +124,9 @@ class XLMTokenizer(PreTrainedTokenizer):
                                            **kwargs)
         try:
             import ftfy
-            import spacy
-            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
+            from spacy.lang.en import English
+            _nlp = English()
+            self.nlp = nlp.Defaults.create_tokenizer(_nlp)
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")

From bb04446285be43059050406b3bc4938807c63c25 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?=
 <37592763+GuillemGSubies@users.noreply.github.com>
Date: Tue, 20 Aug 2019 14:07:40 +0200
Subject: [PATCH 25/36] Update tokenization_openai.py

---
 pytorch_transformers/tokenization_openai.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py
index 0eb5281d39..0f6a8f1dae 100644
--- a/pytorch_transformers/tokenization_openai.py
+++ b/pytorch_transformers/tokenization_openai.py
@@ -89,9 +89,9 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
 
         try:
             import ftfy
-            import spacy
-            self.nlp = spacy.load('en', disable=['parser', 'tagger', 'ner', 'textcat'])
-            self.fix_text = ftfy.fix_text
+            from spacy.lang.en import English
+            _nlp = English()
+            self.nlp = nlp.Defaults.create_tokenizer(_nlp)
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
             self.nlp = BasicTokenizer(do_lower_case=True)

From 562b998366c7a4a2bd0addf1a860fbee0aa04d74 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?=
 <37592763+GuillemGSubies@users.noreply.github.com>
Date: Tue, 20 Aug 2019 14:10:19 +0200
Subject: [PATCH 26/36] Update tokenization_openai.py

---
 pytorch_transformers/tokenization_openai.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py
index 0f6a8f1dae..79eb023a8d 100644
--- a/pytorch_transformers/tokenization_openai.py
+++ b/pytorch_transformers/tokenization_openai.py
@@ -92,6 +92,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
             from spacy.lang.en import English
             _nlp = English()
             self.nlp = nlp.Defaults.create_tokenizer(_nlp)
+            self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")
             self.nlp = BasicTokenizer(do_lower_case=True)

From f5e2ed0fd89d5730126d71c03324fa07ae674ca7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?=
 <37592763+GuillemGSubies@users.noreply.github.com>
Date: Tue, 20 Aug 2019 14:19:25 +0200
Subject: [PATCH 27/36] Update tokenization_openai.py

---
 pytorch_transformers/tokenization_openai.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_transformers/tokenization_openai.py b/pytorch_transformers/tokenization_openai.py
index 79eb023a8d..51b418ebd3 100644
--- a/pytorch_transformers/tokenization_openai.py
+++ b/pytorch_transformers/tokenization_openai.py
@@ -91,7 +91,7 @@ class OpenAIGPTTokenizer(PreTrainedTokenizer):
             import ftfy
             from spacy.lang.en import English
             _nlp = English()
-            self.nlp = nlp.Defaults.create_tokenizer(_nlp)
+            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")

From 388e3251fa95b892949968dc89065e464a93b69f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillem=20Garc=C3=ADa=20Subies?=
 <37592763+GuillemGSubies@users.noreply.github.com>
Date: Tue, 20 Aug 2019 14:19:39 +0200
Subject: [PATCH 28/36] Update tokenization_xlm.py

---
 pytorch_transformers/tokenization_xlm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 8e7c2954f2..2d2f3a8cd4 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -126,7 +126,7 @@ class XLMTokenizer(PreTrainedTokenizer):
             import ftfy
             from spacy.lang.en import English
             _nlp = English()
-            self.nlp = nlp.Defaults.create_tokenizer(_nlp)
+            self.nlp = _nlp.Defaults.create_tokenizer(_nlp)
             self.fix_text = ftfy.fix_text
         except ImportError:
             logger.warning("ftfy or spacy is not installed using BERT BasicTokenizer instead of SpaCy & ftfy.")

From ad6e62cd827d546691845aca5fb9b437c5812d6a Mon Sep 17 00:00:00 2001
From: Nikolay Korolev <korolevns98@gmail.com>
Date: Tue, 20 Aug 2019 15:43:06 +0300
Subject: [PATCH 29/36] Fix typo. configuratoin -> configuration

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7d2445fc11..4e57de5842 100644
--- a/README.md
+++ b/README.md
@@ -328,7 +328,7 @@ Breaking change in the `from_pretrained()`method:
 
 1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
 
-2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuratoin class attributes.
+2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes built based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding to the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
 
 Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
 

From 43489756ad421a99d0f3eb9d83116b9b4904c922 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 20 Aug 2019 16:59:11 +0200
Subject: [PATCH 30/36] adding proxies options for the from_pretrained methods

---
 .gitignore                                 |  4 ++-
 pytorch_transformers/file_utils.py         | 29 +++++++++++-----------
 pytorch_transformers/modeling_utils.py     | 14 +++++++++--
 pytorch_transformers/tokenization_utils.py |  7 +++++-
 4 files changed, 36 insertions(+), 18 deletions(-)

diff --git a/.gitignore b/.gitignore
index 6bbe32df6c..466a167552 100644
--- a/.gitignore
+++ b/.gitignore
@@ -127,4 +127,6 @@ proc_data
 
 # examples
 runs
-examples/runs
\ No newline at end of file
+examples/runs
+
+data
\ No newline at end of file
diff --git a/pytorch_transformers/file_utils.py b/pytorch_transformers/file_utils.py
index 074e6743ef..f6f2151b12 100644
--- a/pytorch_transformers/file_utils.py
+++ b/pytorch_transformers/file_utils.py
@@ -17,8 +17,9 @@ from hashlib import sha256
 from io import open
 
 import boto3
-import requests
+from botocore.config import Config
 from botocore.exceptions import ClientError
+import requests
 from tqdm import tqdm
 
 try:
@@ -93,7 +94,7 @@ def filename_to_url(filename, cache_dir=None):
     return url, etag
 
 
-def cached_path(url_or_filename, cache_dir=None, force_download=False):
+def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=None):
     """
     Given something that might be a URL (or might be a local path),
     determine which. If it's a URL, download the file and cache it, and
@@ -114,7 +115,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False):
 
     if parsed.scheme in ('http', 'https', 's3'):
         # URL, so get it from the cache (downloading if necessary)
-        return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download)
+        return get_from_cache(url_or_filename, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
     elif os.path.exists(url_or_filename):
         # File, and it exists.
         return url_or_filename
@@ -159,24 +160,24 @@ def s3_request(func):
 
 
 @s3_request
-def s3_etag(url):
+def s3_etag(url, proxies=None):
     """Check ETag on S3 object."""
-    s3_resource = boto3.resource("s3")
+    s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
     bucket_name, s3_path = split_s3_path(url)
     s3_object = s3_resource.Object(bucket_name, s3_path)
     return s3_object.e_tag
 
 
 @s3_request
-def s3_get(url, temp_file):
+def s3_get(url, temp_file, proxies=None):
     """Pull a file directly from S3."""
-    s3_resource = boto3.resource("s3")
+    s3_resource = boto3.resource("s3", config=Config(proxies=proxies))
     bucket_name, s3_path = split_s3_path(url)
     s3_resource.Bucket(bucket_name).download_fileobj(s3_path, temp_file)
 
 
-def http_get(url, temp_file):
-    req = requests.get(url, stream=True)
+def http_get(url, temp_file, proxies=None):
+    req = requests.get(url, stream=True, proxies=proxies)
     content_length = req.headers.get('Content-Length')
     total = int(content_length) if content_length is not None else None
     progress = tqdm(unit="B", total=total)
@@ -187,7 +188,7 @@ def http_get(url, temp_file):
     progress.close()
 
 
-def get_from_cache(url, cache_dir=None, force_download=False):
+def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
     """
     Given a URL, look for the corresponding dataset in the local cache.
     If it's not there, download it. Then return the path to the cached file.
@@ -204,10 +205,10 @@ def get_from_cache(url, cache_dir=None, force_download=False):
 
     # Get eTag to add to filename, if it exists.
     if url.startswith("s3://"):
-        etag = s3_etag(url)
+        etag = s3_etag(url, proxies=proxies)
     else:
         try:
-            response = requests.head(url, allow_redirects=True)
+            response = requests.head(url, allow_redirects=True, proxies=proxies)
             if response.status_code != 200:
                 etag = None
             else:
@@ -238,9 +239,9 @@ def get_from_cache(url, cache_dir=None, force_download=False):
 
             # GET file object
             if url.startswith("s3://"):
-                s3_get(url, temp_file)
+                s3_get(url, temp_file, proxies=proxies)
             else:
-                http_get(url, temp_file)
+                http_get(url, temp_file, proxies=proxies)
 
             # we are copying the file before closing it, so flush to avoid truncation
             temp_file.flush()
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 3e4fbca132..f1501aa8d5 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -128,6 +128,10 @@ class PretrainedConfig(object):
             force_download: (`optional`) boolean, default False:
                 Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
 
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
             return_unused_kwargs: (`optional`) bool:
 
                 - If False, then this function returns just the final configuration object.
@@ -150,6 +154,7 @@ class PretrainedConfig(object):
         """
         cache_dir = kwargs.pop('cache_dir', None)
         force_download = kwargs.pop('force_download', False)
+        proxies = kwargs.pop('proxies', None)
         return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
 
         if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
@@ -160,7 +165,7 @@ class PretrainedConfig(object):
             config_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
-            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download)
+            resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
         except EnvironmentError:
             if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
                 logger.error(
@@ -407,6 +412,10 @@ class PreTrainedModel(nn.Module):
             force_download: (`optional`) boolean, default False:
                 Force to (re-)download the model weights and configuration files and override the cached versions if they exists.
 
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
             output_loading_info: (`optional`) boolean:
                 Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
 
@@ -432,6 +441,7 @@ class PreTrainedModel(nn.Module):
         cache_dir = kwargs.pop('cache_dir', None)
         from_tf = kwargs.pop('from_tf', False)
         force_download = kwargs.pop('force_download', False)
+        proxies = kwargs.pop('proxies', None)
         output_loading_info = kwargs.pop('output_loading_info', False)
 
         # Load config
@@ -462,7 +472,7 @@ class PreTrainedModel(nn.Module):
                 archive_file = pretrained_model_name_or_path
         # redirect to the cache, if necessary
         try:
-            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download)
+            resolved_archive_file = cached_path(archive_file, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
         except EnvironmentError:
             if pretrained_model_name_or_path in cls.pretrained_model_archive_map:
                 logger.error(
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 763c0cee04..68af97a518 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -196,6 +196,10 @@ class PreTrainedTokenizer(object):
             force_download: (`optional`) boolean, default False:
                 Force to (re-)download the vocabulary files and override the cached versions if they exists.
 
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
             inputs: (`optional`) positional arguments: will be passed to the Tokenizer ``__init__`` method.
 
             kwargs: (`optional`) keyword arguments: will be passed to the Tokenizer ``__init__`` method. Can be used to set special tokens like ``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``. See parameters in the doc string of :class:`~pytorch_transformers.PreTrainedTokenizer` for details.
@@ -227,6 +231,7 @@ class PreTrainedTokenizer(object):
     def _from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
         cache_dir = kwargs.pop('cache_dir', None)
         force_download = kwargs.pop('force_download', False)
+        proxies = kwargs.pop('proxies', None)
 
         s3_models = list(cls.max_model_input_sizes.keys())
         vocab_files = {}
@@ -287,7 +292,7 @@ class PreTrainedTokenizer(object):
                 if file_path is None:
                     resolved_vocab_files[file_id] = None
                 else:
-                    resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download)
+                    resolved_vocab_files[file_id] = cached_path(file_path, cache_dir=cache_dir, force_download=force_download, proxies=proxies)
         except EnvironmentError:
             if pretrained_model_name_or_path in s3_models:
                 logger.error("Couldn't reach server to download vocabulary.")

From 3bffd2e8e5d726d581e0a66746b25c64d49e231d Mon Sep 17 00:00:00 2001
From: Peng Qi <qipeng@users.noreply.github.com>
Date: Tue, 20 Aug 2019 10:59:28 -0700
Subject: [PATCH 31/36] more fixes

---
 examples/run_glue.py  | 2 +-
 examples/run_squad.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 7fb0732e61..1729f4f7e3 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -467,7 +467,7 @@ def main():
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         model.to(args.device)
 
 
diff --git a/examples/run_squad.py b/examples/run_squad.py
index efa835107c..c0586b03bd 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -481,7 +481,7 @@ def main():
 
 
     # Save the trained model and the tokenizer
-    if args.do_train and args.local_rank == -1 or torch.distributed.get_rank() == 0:
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
         if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
             os.makedirs(args.output_dir)

From aa05dc8935a3e5b349abecbdc5399796578fe965 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 21 Aug 2019 02:29:34 +0200
Subject: [PATCH 32/36] adding gpt-2 large

---
 pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py  | 2 +-
 .../convert_openai_checkpoint_to_pytorch.py                 | 2 +-
 .../convert_transfo_xl_checkpoint_to_pytorch.py             | 2 +-
 pytorch_transformers/modeling_gpt2.py                       | 6 ++++--
 pytorch_transformers/tokenization_gpt2.py                   | 2 ++
 5 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
index f9e83f5d5b..e9bfa0302a 100755
--- a/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_gpt2_checkpoint_to_pytorch.py
@@ -35,7 +35,7 @@ def convert_gpt2_checkpoint_to_pytorch(gpt2_checkpoint_path, gpt2_config_file, p
     if gpt2_config_file == "":
         config = GPT2Config()
     else:
-        config = GPT2Config(gpt2_config_file)
+        config = GPT2Config.from_json_file(gpt2_config_file)
     model = GPT2Model(config)
 
     # Load weights from numpy
diff --git a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
index 70895b4002..3009f8a99e 100755
--- a/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_openai_checkpoint_to_pytorch.py
@@ -35,7 +35,7 @@ def convert_openai_checkpoint_to_pytorch(openai_checkpoint_folder_path, openai_c
     if openai_config_file == "":
         config = OpenAIGPTConfig()
     else:
-        config = OpenAIGPTConfig(openai_config_file)
+        config = OpenAIGPTConfig.from_json_file(openai_config_file)
     model = OpenAIGPTModel(config)
 
     # Load weights from numpy
diff --git a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
index 5733146444..7e79d58d7d 100755
--- a/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_transfo_xl_checkpoint_to_pytorch.py
@@ -75,7 +75,7 @@ def convert_transfo_xl_checkpoint_to_pytorch(tf_checkpoint_path,
         if transfo_xl_config_file == "":
             config = TransfoXLConfig()
         else:
-            config = TransfoXLConfig(transfo_xl_config_file)
+            config = TransfoXLConfig.from_json_file(transfo_xl_config_file)
         print("Building PyTorch model from configuration: {}".format(str(config)))
         model = TransfoXLLMHeadModel(config)
 
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index cb4b8cc4ab..9022048d6d 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -38,9 +38,11 @@ from .modeling_bert import BertLayerNorm as LayerNorm
 logger = logging.getLogger(__name__)
 
 GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
-                                     "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin"}
+                                     "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
+                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}
 GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
-                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json"}
+                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
+                                      "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"}
 
 def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
     """ Load tf checkpoints in a pytorch model
diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index 0aee856180..4016a85a7f 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -45,11 +45,13 @@ PRETRAINED_VOCAB_FILES_MAP = {
     {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
+        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
     },
     'merges_file':
     {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
+        'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
     },
 }
 

From fdc487d8b33dcb8b2ddebd7a1fe4bd0eee4e2a40 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 21 Aug 2019 02:35:01 +0200
Subject: [PATCH 33/36] Add max length

---
 pytorch_transformers/tokenization_gpt2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pytorch_transformers/tokenization_gpt2.py b/pytorch_transformers/tokenization_gpt2.py
index 4016a85a7f..e67f25ff59 100644
--- a/pytorch_transformers/tokenization_gpt2.py
+++ b/pytorch_transformers/tokenization_gpt2.py
@@ -58,6 +58,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'gpt2': 1024,
     'gpt2-medium': 1024,
+    'gpt2-large': 1024,
 }
 
 @lru_cache()

From 6f877d9daf36788bad4fd228930939fed6ab12bd Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 21 Aug 2019 03:43:29 +0000
Subject: [PATCH 34/36] Update dev results on GLUE (bert-base-uncased) w/
 median on 5 runs

---
 docs/source/examples.rst | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/docs/source/examples.rst b/docs/source/examples.rst
index 51c8d850b9..7777117b47 100644
--- a/docs/source/examples.rst
+++ b/docs/source/examples.rst
@@ -68,7 +68,9 @@ GLUE results on dev set
 ~~~~~~~~~~~~~~~~~~~~~~~
 
 We get the following results on the dev set of GLUE benchmark with an uncased BERT base
-model. All experiments were run on a P100 GPU with a batch size of 32.
+model (`bert-base-uncased`). All experiments ran on 8 V100 GPUs with a total train batch size of 24. Some of 
+these tasks have a small dataset and training can lead to high variance in the results between different runs.
+We report the median on 5 runs (with different seeds) for each of the metrics.
 
 .. list-table::
    :header-rows: 1
@@ -78,31 +80,31 @@ model. All experiments were run on a P100 GPU with a batch size of 32.
      - Result
    * - CoLA
      - Matthew's corr.
-     - 57.29
+     - 55.75
    * - SST-2
      - accuracy
-     - 93.00
+     - 92.09
    * - MRPC
      - F1/accuracy
-     - 88.85/83.82
+     - 90.48/86.27
    * - STS-B
      - Pearson/Spearman corr.
-     - 89.70/89.37
+     - 89.03/88.64
    * - QQP
      - accuracy/F1
-     - 90.72/87.41
+     - 90.92/87.72
    * - MNLI
      - matched acc./mismatched acc.
-     - 83.95/84.39
+     - 83.74/84.06
    * - QNLI
      - accuracy
-     - 89.04
+     - 91.07
    * - RTE
      - accuracy
-     - 61.01
+     - 68.59
    * - WNLI
      - accuracy
-     - 53.52
+     - 43.66
 
 
 Some of these results are significantly different from the ones reported on the test set

From 2f9397139d1be373efa76b8133d71e1bdc43bbb3 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Wed, 21 Aug 2019 11:29:37 -0400
Subject: [PATCH 35/36] Added GPT-2 LARGE to Pre-trained Models documentation

---
 docs/source/pretrained_models.rst | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 6a14e3dcd1..7df70ea225 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -62,6 +62,9 @@ Here is the full list of the currently provided pretrained models together with
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``gpt2-medium``                                            | | 24-layer, 1024-hidden, 16-heads, 345M parameters.                                                                                   |
 |                   |                                                            | | OpenAI's Medium-sized GPT-2 English model                                                                                           |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``gpt2-large``                                             | | 36-layer, 1280-hidden, 20-heads, 774M parameters.                                                                                   |
+|                   |                                                            | | OpenAI's Large-sized GPT-2 English model                                                                                            |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | Transformer-XL    | ``transfo-xl-wt103``                                       | | 18-layer, 1024-hidden, 16-heads, 257M parameters.                                                                                   |
 |                   |                                                            | | English model trained on wikitext-103                                                                                               |
@@ -72,16 +75,16 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``xlnet-large-cased``                                      | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
 |                   |                                                            | | XLNet Large English model                                                                                                           |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| XLM               | ``xlm-mlm-en-2048``                                        | | 12-layer, 2048-hidden, 16-heads                                                                                                      |
+| XLM               | ``xlm-mlm-en-2048``                                        | | 12-layer, 2048-hidden, 16-heads                                                                                                     |
 |                   |                                                            | | XLM English model                                                                                                                   |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   | ``xlm-mlm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
 |                   |                                                            | | XLM English-German Multi-language model                                                                                             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   | ``xlm-mlm-enfr-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
 |                   |                                                            | | XLM English-French Multi-language model                                                                                             |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-enro-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   | ``xlm-mlm-enro-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
 |                   |                                                            | | XLM English-Romanian Multi-language model                                                                                           |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-mlm-xnli15-1024``                                    | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
@@ -93,7 +96,7 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``xlm-clm-enfr-1024``                                      | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
 |                   |                                                            | | XLM English model trained with CLM (Causal Language Modeling)                                                                       |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-clm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   | ``xlm-clm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
 |                   |                                                            | | XLM English-German Multi-language model trained with CLM (Causal Language Modeling)                                                 |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | RoBERTa           | ``roberta-base``                                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |

From e00b4ff1de0591d5093407b16e665e5c86028f04 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 21 Aug 2019 22:22:17 +0200
Subject: [PATCH 36/36] fix #1017

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4e57de5842..9751c720b8 100644
--- a/README.md
+++ b/README.md
@@ -393,8 +393,8 @@ for batch in train_data:
     loss = model(batch)
     loss.backward()
     torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)  # Gradient clipping is not in AdamW anymore (so you can use amp without issue)
-    scheduler.step()
     optimizer.step()
+    scheduler.step()
     optimizer.zero_grad()
 ```