From ec94f4e0f80d33433cbb2c14fd694af33656b779 Mon Sep 17 00:00:00 2001
From: Simon Layton <slayton58@users.noreply.github.com>
Date: Wed, 18 Sep 2019 09:30:58 -0400
Subject: [PATCH 01/55] Fix fp16 masking in PoolerEndLogits

Necessary to run xlnet (at least in squad) with `--fp16 --fp16_opt_level="O2"`, otherwise loss is immediately `NaN` and fine-tuning cannot proceed.
---
 pytorch_transformers/modeling_utils.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 25aeefe10f..fdc8415fa6 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -478,7 +478,10 @@ class PoolerEndLogits(nn.Module):
         x = self.dense_1(x).squeeze(-1)
 
         if p_mask is not None:
-            x = x * (1 - p_mask) - 1e30 * p_mask
+            if next(self.parameters()).dtype == torch.float16:
+                x = x * (1 - p_mask) - 65500 * p_mask
+            else:
+                x = x * (1 - p_mask) - 1e30 * p_mask
 
         return x
 

From f0340eccf90a81462ca025ef21350bdb13f22b42 Mon Sep 17 00:00:00 2001
From: Erik Chan <erik.chan@gmail.com>
Date: Wed, 18 Sep 2019 13:42:11 -0700
Subject: [PATCH 02/55] Typo

Typo
---
 examples/run_lm_finetuning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 4d14fe7ebb..dad9fab83f 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -57,7 +57,7 @@ class TextDataset(Dataset):
     def __init__(self, tokenizer, file_path='train', block_size=512):
         assert os.path.isfile(file_path)
         directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, f'cached_lm_{block_size}_{filename}')
+        cached_features_file = os.path.join(directory, 'cached_lm_{block_size}_{filename}')
 
         if os.path.exists(cached_features_file):
             logger.info("Loading features from cached file %s", cached_features_file)

From 2e6797cc7d467bc2242c54fe61ae61891d19677f Mon Sep 17 00:00:00 2001
From: danai-antoniou <danaiantoniou@monzo.com>
Date: Thu, 19 Sep 2019 15:40:42 +0100
Subject: [PATCH 03/55] Added ValueError for duplicate added tokens

---
 pytorch_transformers/tokenization_utils.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 1e2cd59648..bdc0ec7d3c 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -503,6 +503,9 @@ class PreTrainedTokenizer(object):
         if not new_tokens:
             return 0
 
+        if len(new_tokens) != len(set(new_tokens)):
+            raise ValueError("The provided list of tokens contains duplicates.")
+
         to_add_tokens = []
         for token in new_tokens:
             assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))

From 4b543c3007a57441550d87d5d61f06f7938d7140 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil <lorenzojulioampil@gmail.com>
Date: Sun, 22 Sep 2019 21:38:38 +0800
Subject: [PATCH 04/55] Add option to use a 'stop token' which will be used to
 truncate the output text to everything till right before the 'stop token'

---
 examples/run_generation.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/run_generation.py b/examples/run_generation.py
index a2a8f29103..27bc14e313 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -145,6 +145,8 @@ def main():
                         help="Avoid using CUDA when available")
     parser.add_argument('--seed', type=int, default=42,
                         help="random seed for initialization")
+    parser.add_argument('--stop_token', type=str, default=None,
+                        help="Token at which text generation is stopped")
     args = parser.parse_args()
 
     args.device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
@@ -185,6 +187,7 @@ def main():
         )
         out = out[0, len(context_tokens):].tolist()
         text = tokenizer.decode(out, clean_up_tokenization_spaces=True)
+        text = text[: text.find(args.stop_token) if args.stop_token else None]
         print(text)
         if args.prompt:
             break

From 0a4ed7192ec69023f9c8db913b5923e00a92598e Mon Sep 17 00:00:00 2001
From: Tim Yagan <30977192+TimYagan@users.noreply.github.com>
Date: Sun, 29 Sep 2019 13:51:01 +0200
Subject: [PATCH 05/55] Fixed critical css font-family issues

Fixed critical CSS font-family issues to ensure compatibility with multiple web browsers
---
 docs/source/_static/css/huggingface.css | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)

diff --git a/docs/source/_static/css/huggingface.css b/docs/source/_static/css/huggingface.css
index 84740cb4df..3f006a996b 100644
--- a/docs/source/_static/css/huggingface.css
+++ b/docs/source/_static/css/huggingface.css
@@ -1,5 +1,3 @@
-huggingface.css
-
 /* The literal code blocks */
 .rst-content tt.literal, .rst-content tt.literal, .rst-content code.literal {
     color: #6670FF;
@@ -44,11 +42,11 @@ huggingface.css
 /* The text items on the toc tree */
 .wy-menu-vertical a {
     color: #FFFFDD;
-    font-family: Calibre-Light;
+    font-family: Calibre-Light, sans-serif;
 }
 .wy-menu-vertical header, .wy-menu-vertical p.caption{
     color: white;
-    font-family: Calibre-Light;
+    font-family: Calibre-Light, sans-serif;
 }
 
 /* The color inside the selected toc tree block */
@@ -85,7 +83,7 @@ a {
     border-right: solid 2px #FB8D68;
     border-left: solid 2px #FB8D68;
     color: #FB8D68;
-    font-family: Calibre-Light;
+    font-family: Calibre-Light, sans-serif;
     border-top: none;
     font-style: normal !important;
 }
@@ -136,14 +134,14 @@ a {
 
 /* class and method names in doc */
 .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname, .rst-content dl:not(.docutils) tt.descclassname, .rst-content dl:not(.docutils) code.descclassname{
-    font-family: Calibre;
+    font-family: Calibre, sans-serif;
     font-size: 20px !important;
 }
 
 /* class name in doc*/
 .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) tt.descname, .rst-content dl:not(.docutils) code.descname{
     margin-right: 10px;
-    font-family: Calibre-Medium;
+    font-family: Calibre-Medium, sans-serif;
 }
 
 /* Method and class parameters */
@@ -160,17 +158,17 @@ a {
 
 /* FONTS */
 body{
-    font-family: Calibre;
+    font-family: Calibre, sans-serif;
     font-size: 16px;
 }
 
 h1 {
-    font-family: Calibre-Thin;
+    font-family: Calibre-Thin, sans-serif;
     font-size: 70px;
 }
 
 h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
-    font-family: Calibre-Medium;
+    font-family: Calibre-Medium, sans-serif;
 }
 
 @font-face {
@@ -196,4 +194,3 @@ h2, .rst-content .toctree-wrapper p.caption, h3, h4, h5, h6, legend{
     src: url(./Calibre-Thin.otf);
     font-weight:400;
 }
-

From 5c3b32d44d0164aaa9b91405f48e53cf53a82b35 Mon Sep 17 00:00:00 2001
From: Santosh Gupta <Santosh.gupta.eng@gmail.com>
Date: Sat, 28 Sep 2019 16:35:06 -0700
Subject: [PATCH 06/55] Update README.md

Lines 183 - 200, fixed indentation. Line 198, replaced `tokenizer_class` with `BertTokenizer`, since `tokenizer_class` is not defined in the loop it belongs to.
---
 README.md | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/README.md b/README.md
index 8dd2c2fb66..b2c5df77e9 100644
--- a/README.md
+++ b/README.md
@@ -180,24 +180,24 @@ for model_class in BERT_MODEL_CLASSES:
     # Load pretrained model/tokenizer
     model = model_class.from_pretrained('bert-base-uncased')
 
-# Models can return full list of hidden-states & attentions weights at each layer
-model = model_class.from_pretrained(pretrained_weights,
-                                    output_hidden_states=True,
-                                    output_attentions=True)
-input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
-all_hidden_states, all_attentions = model(input_ids)[-2:]
+    # Models can return full list of hidden-states & attentions weights at each layer
+    model = model_class.from_pretrained(pretrained_weights,
+                                        output_hidden_states=True,
+                                        output_attentions=True)
+    input_ids = torch.tensor([tokenizer.encode("Let's see all hidden-states and attentions on this text")])
+    all_hidden_states, all_attentions = model(input_ids)[-2:]
 
-# Models are compatible with Torchscript
-model = model_class.from_pretrained(pretrained_weights, torchscript=True)
-traced_model = torch.jit.trace(model, (input_ids,))
+    # Models are compatible with Torchscript
+    model = model_class.from_pretrained(pretrained_weights, torchscript=True)
+    traced_model = torch.jit.trace(model, (input_ids,))
 
-# Simple serialization for models and tokenizers
-model.save_pretrained('./directory/to/save/')  # save
-model = model_class.from_pretrained('./directory/to/save/')  # re-load
-tokenizer.save_pretrained('./directory/to/save/')  # save
-tokenizer = tokenizer_class.from_pretrained('./directory/to/save/')  # re-load
+    # Simple serialization for models and tokenizers
+    model.save_pretrained('./directory/to/save/')  # save
+    model = model_class.from_pretrained('./directory/to/save/')  # re-load
+    tokenizer.save_pretrained('./directory/to/save/')  # save
+    tokenizer = BertTokenizer.from_pretrained('./directory/to/save/')  # re-load
 
-# SOTA examples for GLUE, SQUAD, text generation...
+    # SOTA examples for GLUE, SQUAD, text generation...
 ```
 
 ## Quick tour TF 2.0 training and PyTorch interoperability

From 6971556ab83a5a3edd2f99d322b0954499393d2b Mon Sep 17 00:00:00 2001
From: DenysNahurnyi <dnahurnyi@gmail.com>
Date: Tue, 1 Oct 2019 21:57:18 +0300
Subject: [PATCH 07/55] Fix syntax typo in README.md

---
 README.md | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index b2c5df77e9..b5b7245bd9 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,7 @@ pip install transformers
 Here also, you first need to install one of, or both, TensorFlow 2.0 and PyTorch.
 Please refere to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
 
-When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and runing:
+When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and running:
 
 ```bash
 pip install [--editable] .
@@ -88,7 +88,7 @@ pip install [--editable] .
 
 ### Tests
 
-A series of tests is included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
+A series of tests are included for the library and the example scripts. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
 
 These tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
 
@@ -394,7 +394,7 @@ This is the model provided as `bert-large-uncased-whole-word-masking-finetuned-s
 ### `run_generation.py`: Text generation with GPT, GPT-2, Transformer-XL and XLNet
 
 A conditional generation script is also included to generate text from a prompt.
-The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
+The generation script includes the [tricks](https://github.com/rusiaaman/XLNet-gen#methodology) proposed by Aman Rusia to get high-quality generation with memory models like Transformer-XL and XLNet (include a predefined text to make short inputs longer).
 
 Here is how to run the script with the small version of OpenAI GPT-2 model:
 
@@ -426,7 +426,7 @@ Here is a quick summary of what you should take care of when migrating from `pyt
 
 The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
 
-The exact content of the tuples for each model are detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
+The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
 
 In pretty much every case, you will be fine by taking the first element of the output as the output you previously used in `pytorch-pretrained-bert`.
 
@@ -458,7 +458,7 @@ By enabling the configuration option `output_hidden_states`, it was possible to
 
 ### Serialization
 
-Breaking change in the `from_pretrained()`method:
+Breaking change in the `from_pretrained()` method:
 
 1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
 
@@ -534,4 +534,4 @@ for batch in train_data:
 
 ## Citation
 
-At the moment, there is no paper associated to Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.
+At the moment, there is no paper associated with Transformers but we are working on preparing one. In the meantime, please include a mention of the library and a link to the present repository if you use this work in a published or open-source project.

From f7978f70ecadd3ac6e3072f72e9dcc8c01e187c8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 1 Oct 2019 18:45:38 -0400
Subject: [PATCH 08/55] use format instead of f-strings

---
 examples/run_lm_finetuning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index dad9fab83f..0115dcb98b 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -57,7 +57,7 @@ class TextDataset(Dataset):
     def __init__(self, tokenizer, file_path='train', block_size=512):
         assert os.path.isfile(file_path)
         directory, filename = os.path.split(file_path)
-        cached_features_file = os.path.join(directory, 'cached_lm_{block_size}_{filename}')
+        cached_features_file = os.path.join(directory, 'cached_lm_{}_{}'.format(block_size, filename))
 
         if os.path.exists(cached_features_file):
             logger.info("Loading features from cached file %s", cached_features_file)

From 391db836ab7ed2ca61c51a7cf1b135b6ab92be58 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 1 Oct 2019 19:09:13 -0400
Subject: [PATCH 09/55] fix #1260 - remove special logic for decoding pairs of
 sequence

---
 transformers/tokenization_utils.py | 17 ++++-------------
 1 file changed, 4 insertions(+), 13 deletions(-)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 1e20588f83..db9e9cd72e 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -933,20 +933,11 @@ class PreTrainedTokenizer(object):
             sub_texts.append(self.convert_tokens_to_string(current_sub_text))
         text = ''.join(sub_texts)
 
-        if self._sep_token is not None and self._sep_token in text:
-            text = text.replace(self._cls_token, self._sep_token)
-            split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self._sep_token)))
-            if clean_up_tokenization_spaces:
-                clean_text = [self.clean_up_tokenization(text) for text in split_text]
-                return clean_text
-            else:
-                return split_text
+        if clean_up_tokenization_spaces:
+            clean_text = self.clean_up_tokenization(text)
+            return clean_text
         else:
-            if clean_up_tokenization_spaces:
-                clean_text = self.clean_up_tokenization(text)
-                return clean_text
-            else:
-                return text
+            return text
 
     @property
     def special_tokens_map(self):

From cd69bc9c8750a83d96bd16bffaea916e8d55a6f1 Mon Sep 17 00:00:00 2001
From: Dima Veselov <d.a.veselov@yandex.ru>
Date: Wed, 2 Oct 2019 03:21:55 +0300
Subject: [PATCH 10/55] Fixed typo in docs README

---
 docs/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/README.md b/docs/README.md
index 87fa5b90a0..de37f7cba1 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -50,7 +50,7 @@ make html
 ---
 **NOTE**
 
-If you are adding/removing elements from the toc-tree or from any strutural item, it is recommended to clean the build
+If you are adding/removing elements from the toc-tree or from any structural item, it is recommended to clean the build
 directory before rebuilding. Run the following command to clean and build:
 
 ```bash

From a95158518d65fe640ecb35813280609e27ba3ab7 Mon Sep 17 00:00:00 2001
From: danai-antoniou <danaiantoniou@monzo.com>
Date: Wed, 2 Oct 2019 07:44:15 +0100
Subject: [PATCH 11/55] Moved duplicate token check

---
 transformers/tokenization_utils.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index d8b3c0c74b..de3f48f4c3 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -508,14 +508,12 @@ class PreTrainedTokenizer(object):
         if not new_tokens:
             return 0
 
-        if len(new_tokens) != len(set(new_tokens)):
-            raise ValueError("The provided list of tokens contains duplicates.")
-
         to_add_tokens = []
         for token in new_tokens:
             assert isinstance(token, str) or (six.PY2 and isinstance(token, unicode))
             if token != self.unk_token and \
-                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token):
+                    self.convert_tokens_to_ids(token) == self.convert_tokens_to_ids(self.unk_token) and \
+                    token not in to_add_tokens:
                 to_add_tokens.append(token)
                 logger.info("Adding %s to the vocabulary", token)
 

From 63ed224b7c550ead5f9599187e665ded57ce80d4 Mon Sep 17 00:00:00 2001
From: Santiago Castro <sacastro@umich.edu>
Date: Wed, 2 Oct 2019 11:02:08 -0400
Subject: [PATCH 12/55] initialy -> initially

---
 transformers/modeling_bert.py          | 2 +-
 transformers/modeling_tf_bert.py       | 2 +-
 transformers/modeling_tf_distilbert.py | 2 +-
 transformers/modeling_tf_xlm.py        | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 51e407d0a6..fc448fa366 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -118,7 +118,7 @@ def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
 
 
 def gelu(x):
-    """ Original Implementation of the gelu activation function in Google Bert repo when initialy created.
+    """ Original Implementation of the gelu activation function in Google Bert repo when initially created.
         For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
         0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
         Also see https://arxiv.org/abs/1606.08415
diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py
index d763ca991e..4de94751f8 100644
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -62,7 +62,7 @@ def load_bert_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
 
 def gelu(x):
     """ Gaussian Error Linear Unit.
-    Original Implementation of the gelu activation function in Google Bert repo when initialy created.
+    Original Implementation of the gelu activation function in Google Bert repo when initially created.
         For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
         0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
         Also see https://arxiv.org/abs/1606.08415
diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py
index 2a917a30a4..5ce1616bcc 100644
--- a/transformers/modeling_tf_distilbert.py
+++ b/transformers/modeling_tf_distilbert.py
@@ -45,7 +45,7 @@ TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
 ### UTILS AND BUILDING BLOCKS OF THE ARCHITECTURE ###
 def gelu(x):
     """ Gaussian Error Linear Unit.
-    Original Implementation of the gelu activation function in Google Bert repo when initialy created.
+    Original Implementation of the gelu activation function in Google Bert repo when initially created.
         For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
         0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
         Also see https://arxiv.org/abs/1606.08415
diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py
index f8f199bbe6..83cc37c6a7 100644
--- a/transformers/modeling_tf_xlm.py
+++ b/transformers/modeling_tf_xlm.py
@@ -69,7 +69,7 @@ def create_sinusoidal_embeddings(n_pos, dim, out):
 
 def gelu(x):
     """ Gaussian Error Linear Unit.
-    Original Implementation of the gelu activation function in Google Bert repo when initialy created.
+    Original Implementation of the gelu activation function in Google Bert repo when initially created.
         For information: OpenAI GPT's gelu is slightly different (and gives slightly different results):
         0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
         Also see https://arxiv.org/abs/1606.08415

From ebb32261b19eaa258f998d2725116fe7a08224a6 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 2 Oct 2019 17:52:56 -0400
Subject: [PATCH 13/55] fix #1401

---
 docs/source/pretrained_models.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 4c17b35c84..c12a9bc52f 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -98,6 +98,12 @@ Here is the full list of the currently provided pretrained models together with
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``xlm-clm-ende-1024``                                      | | 6-layer, 1024-hidden, 8-heads                                                                                                       |
 |                   |                                                            | | XLM English-German model trained with CLM (Causal Language Modeling) on the concatenation of English and German wikipedia           |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-17-1280``                                        | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
+|                   |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 17 languages.                                                              |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-100-1280``                                       | | 16-layer, 1280-hidden, 16-heads                                                                                                     |
+|                   |                                                            | | XLM model trained with MLM (Masked Language Modeling) on 100 languages.                                                             |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | RoBERTa           | ``roberta-base``                                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
 |                   |                                                            | | RoBERTa using the BERT-base architecture                                                                                            |

From 2195c0d5f9464385d14c43605a7e9f9a93a91b9a Mon Sep 17 00:00:00 2001
From: Brian Ma <brian41005@gmail.com>
Date: Thu, 3 Oct 2019 12:49:12 +0800
Subject: [PATCH 14/55] Evaluation result.txt path changing #1286

---
 examples/run_glue.py            | 6 ++++--
 examples/run_lm_finetuning.py   | 6 ++++--
 examples/run_multiple_choice.py | 8 ++++++--
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index fc3b617da0..3d16c63829 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -248,7 +248,7 @@ def evaluate(args, model, tokenizer, prefix=""):
         result = compute_metrics(eval_task, preds, out_label_ids)
         results.update(result)
 
-        output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
+        output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
         with open(output_eval_file, "w") as writer:
             logger.info("***** Eval results {} *****".format(prefix))
             for key in sorted(result.keys()):
@@ -489,9 +489,11 @@ def main():
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
             global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+            
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = evaluate(args, model, tokenizer, prefix=prefix)
             result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
             results.update(result)
 
diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index a91deebb6c..c167703d7b 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -282,7 +282,7 @@ def evaluate(args, model, tokenizer, prefix=""):
         "perplexity": perplexity
     }
 
-    output_eval_file = os.path.join(eval_output_dir, "eval_results.txt")
+    output_eval_file = os.path.join(eval_output_dir, prefix, "eval_results.txt")
     with open(output_eval_file, "w") as writer:
         logger.info("***** Eval results {} *****".format(prefix))
         for key in sorted(result.keys()):
@@ -484,9 +484,11 @@ def main():
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
             global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+            
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = evaluate(args, model, tokenizer, prefix=prefix)
             result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
             results.update(result)
 
diff --git a/examples/run_multiple_choice.py b/examples/run_multiple_choice.py
index 54f3a8a904..a983daad76 100644
--- a/examples/run_multiple_choice.py
+++ b/examples/run_multiple_choice.py
@@ -512,9 +512,11 @@ def main():
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
             global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+            
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=global_step)
+            result = evaluate(args, model, tokenizer, prefix=prefix)
             result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
             results.update(result)
 
@@ -528,9 +530,11 @@ def main():
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
         for checkpoint in checkpoints:
             global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            prefix = checkpoint.split('/')[-1] if checkpoint.find('checkpoint') != -1 else ""
+            
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
-            result = evaluate(args, model, tokenizer, prefix=global_step, test=True)
+            result = evaluate(args, model, tokenizer, prefix=prefix, test=True)
             result = dict((k + '_{}'.format(global_step), v) for k, v in result.items())
             results.update(result)
     if best_steps:

From 38084507c45c784dd5041058b8aa1676a633a18c Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 11:00:46 -0400
Subject: [PATCH 15/55] add distillation_configs

---
 .../training_configs/distilbert-base-uncased.json | 15 +++++++++++++++
 .../distillation/training_configs/distilgpt2.json | 10 ++++++++++
 2 files changed, 25 insertions(+)
 create mode 100644 examples/distillation/training_configs/distilbert-base-uncased.json
 create mode 100644 examples/distillation/training_configs/distilgpt2.json

diff --git a/examples/distillation/training_configs/distilbert-base-uncased.json b/examples/distillation/training_configs/distilbert-base-uncased.json
new file mode 100644
index 0000000000..15d1e7fe00
--- /dev/null
+++ b/examples/distillation/training_configs/distilbert-base-uncased.json
@@ -0,0 +1,15 @@
+{
+	"activation": "gelu",
+	"attention_dropout": 0.1,
+	"dim": 768,
+	"dropout": 0.1,
+	"hidden_dim": 3072,
+	"initializer_range": 0.02,
+	"max_position_embeddings": 512,
+	"n_heads": 12,
+	"n_layers": 6,
+	"sinusoidal_pos_embds": true,
+	"tie_weights_": true,
+	"vocab_size": 30522
+  }
+  
\ No newline at end of file
diff --git a/examples/distillation/training_configs/distilgpt2.json b/examples/distillation/training_configs/distilgpt2.json
new file mode 100644
index 0000000000..8616e8e60f
--- /dev/null
+++ b/examples/distillation/training_configs/distilgpt2.json
@@ -0,0 +1,10 @@
+{
+	"initializer_range": 0.02,
+	"layer_norm_epsilon": 0.00001,
+	"n_ctx": 1024,
+	"n_embd": 768,
+	"n_head": 12,
+	"n_layer": 6,
+	"n_positions": 1024,
+	"vocab_size": 50257
+}
\ No newline at end of file

From 594202a9348d7c2f27f7deaf1a7308e3751b3fbc Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 11:00:57 -0400
Subject: [PATCH 16/55] lm_seqs_dataset

---
 .../{dataset.py => lm_seqs_dataset.py}        | 124 ++++++------------
 1 file changed, 37 insertions(+), 87 deletions(-)
 rename examples/distillation/{dataset.py => lm_seqs_dataset.py} (54%)

diff --git a/examples/distillation/dataset.py b/examples/distillation/lm_seqs_dataset.py
similarity index 54%
rename from examples/distillation/dataset.py
rename to examples/distillation/lm_seqs_dataset.py
index 4babf73ea4..54e9742ce8 100644
--- a/examples/distillation/dataset.py
+++ b/examples/distillation/lm_seqs_dataset.py
@@ -12,30 +12,33 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Dataloaders to train DistilBERT
+""" Dataset for distilled models
     adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
 """
-from typing import List
-import math
-from itertools import chain
-from collections import Counter
-import numpy as np
 import torch
+from torch.utils.data import Dataset
 
+import numpy as np
 from utils import logger
 
-class Dataset:
+class LmSeqsDataset(Dataset):
+    """Custom Dataset wrapping language modeling sequences.
+
+    Each sample will be retrieved by indexing the list of token_ids and their corresponding lengths.
+
+    Input:
+    ------
+        params: `NameSpace` parameters
+        data: `List[np.array[int]]`
+    """
+
     def __init__(self,
                  params,
                  data):
         self.params = params
-        self.tokens_per_batch = params.tokens_per_batch
-        self.batch_size = params.batch_size
-        self.shuffle = params.shuffle
-        self.group_by_size = params.group_by_size
 
         self.token_ids = np.array(data)
-        self.lengths = np.uint16([len(t) for t in data])
+        self.lengths = np.array([len(t) for t in data])
 
         self.check()
         self.remove_long_sequences()
@@ -43,6 +46,9 @@ class Dataset:
         self.check()
         self.print_statistics()
 
+    def __getitem__(self, index):
+        return (self.token_ids[index], self.lengths[index])
+
     def __len__(self):
         return len(self.lengths)
 
@@ -51,12 +57,14 @@ class Dataset:
         Some sanity checks
         """
         assert len(self.token_ids) == len(self.lengths)
+        assert all(self.lengths[i] == len(self.token_ids[i]) for i in range(len(self.lengths))) 
 
     def remove_long_sequences(self):
         """
-        Sequences that are too long are splitted by chunk of max_position_embeddings.
+        Sequences that are too long are split into chunks of at most max_model_input_size.
         """
-        indices = self.lengths >= self.params.max_position_embeddings
+        max_len = self.params.max_model_input_size
+        indices = self.lengths > max_len
         logger.info(f'Splitting {sum(indices)} too long sequences.')
 
         def divide_chunks(l, n):
@@ -64,10 +72,13 @@ class Dataset:
 
         new_tok_ids = []
         new_lengths = []
-        cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
-        max_len = self.params.max_position_embeddings
+        if self.params.mlm:
+            cls_id, sep_id = self.params.special_tok_ids['cls_token'], self.params.special_tok_ids['sep_token']
+        else:
+            cls_id, sep_id = self.params.special_tok_ids['bos_token'], self.params.special_tok_ids['eos_token']
 
         for seq_, len_ in zip(self.token_ids, self.lengths):
+            assert (seq_[0] == cls_id) and (seq_[-1] == sep_id), seq_
             if len_ <= max_len:
                 new_tok_ids.append(seq_)
                 new_lengths.append(len_)
@@ -79,6 +90,7 @@ class Dataset:
                     if sub_s[-1] != sep_id:
                         sub_s = np.insert(sub_s, len(sub_s), sep_id)
                     assert len(sub_s) <= max_len
+                    assert (sub_s[0] == cls_id) and (sub_s[-1] == sep_id), sub_s
                     sub_seqs.append(sub_s)
 
                 new_tok_ids.extend(sub_seqs)
@@ -113,89 +125,27 @@ class Dataset:
         # nb_unkown = sum([(t==unk_idx).sum() for t in self.token_ids])
         # logger.info(f'{nb_unkown} unknown tokens (covering {100*nb_unkown/data_len:.2f}% of the data)')
 
-    def select_data(self, a: int, b: int):
-        """
-        Select a subportion of the data.
-        """
-        n_sequences = len(self)
-        assert 0 <= a < b <= n_sequences, ValueError(f'`0 <= a < b <= n_sequences` is not met with a={a} and b={b}')
-
-        logger.info(f'Selecting sequences from {a} to {b} (excluded).')
-        self.token_ids = self.token_ids[a:b]
-        self.lengths = self.lengths[a:b]
-
-        self.check()
-
-    def split(self):
-        """
-        Distributed training: split the data accross the processes.
-        """
-        assert self.params.n_gpu > 1
-        logger.info('Splitting the data accross the processuses.')
-        n_seq = len(self)
-        n_seq_per_procesus = n_seq // self.params.world_size
-        a = n_seq_per_procesus * self.params.global_rank
-        b = a + n_seq_per_procesus
-        self.select_data(a=a, b=b)
-
     def batch_sequences(self,
-                        token_ids: List[List[int]],
-                        lengths: List[int]):
+                        batch):
         """
         Do the padding and transform into torch.tensor.
         """
+        token_ids = [t[0] for t in batch]
+        lengths = [t[1] for t in batch]
         assert len(token_ids) == len(lengths)
 
         # Max for paddings
         max_seq_len_ = max(lengths)
 
         # Pad token ids
-        pad_idx = self.params.special_tok_ids['pad_token']
+        if self.params.mlm:
+            pad_idx = self.params.special_tok_ids['pad_token']
+        else:
+            pad_idx = self.params.special_tok_ids['unk_token']
         tk_ = [list(t.astype(int)) + [pad_idx]*(max_seq_len_-len(t)) for t in token_ids]
         assert len(tk_) == len(token_ids)
         assert all(len(t) == max_seq_len_ for t in tk_)
 
-        tk_t = torch.tensor(tk_)                  # (bs, max_seq_len_)
-        lg_t = torch.tensor(lengths.astype(int))  # (bs)
+        tk_t = torch.tensor(tk_)      # (bs, max_seq_len_)
+        lg_t = torch.tensor(lengths)  # (bs)
         return tk_t, lg_t
-
-    def get_batches_iterator(self,
-                             batches):
-        """
-        Return an iterator over batches.
-        """
-        for sequences_ids in batches:
-            token_ids, lengths = self.batch_sequences(self.token_ids[sequences_ids],
-                                                    self.lengths[sequences_ids])
-            yield (token_ids, lengths)
-
-    def get_iterator(self,
-                     seed: int = None):
-        """
-        Return a data iterator.
-        """
-        rng = np.random.RandomState(seed)
-
-        n_sequences = len(self)
-        indices = np.arange(n_sequences)
-
-        if self.group_by_size:
-            indices = indices[np.argsort(self.lengths[indices], kind='mergesort')]
-
-        if self.tokens_per_batch == -1:
-            batches = np.array_split(indices, math.ceil(len(indices) * 1. / self.batch_size))
-        else:
-            assert self.tokens_per_batch > 0
-            batch_ids = np.cumsum(self.lengths[indices]) // self.tokens_per_batch
-            _, bounds = np.unique(batch_ids, return_index=True)
-            batches = [indices[bounds[i]:bounds[i + 1]] for i in range(len(bounds) - 1)]
-            if bounds[-1] < len(indices):
-                batches.append(indices[bounds[-1]:])
-
-        if self.shuffle:
-            rng.shuffle(batches)
-
-        assert n_sequences == sum([len(x) for x in batches])
-        assert self.lengths[indices].sum() == sum([self.lengths[x].sum() for x in batches])
-
-        return self.get_batches_iterator(batches=batches)

From 19e4ebbe3fcded8a345fed05d9c3644b78312839 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 11:01:07 -0400
Subject: [PATCH 17/55] grouped_batch_sampler

---
 .../distillation/grouped_batch_sampler.py     | 105 ++++++++++++++++++
 1 file changed, 105 insertions(+)
 create mode 100644 examples/distillation/grouped_batch_sampler.py

diff --git a/examples/distillation/grouped_batch_sampler.py b/examples/distillation/grouped_batch_sampler.py
new file mode 100644
index 0000000000..46d943a3d4
--- /dev/null
+++ b/examples/distillation/grouped_batch_sampler.py
@@ -0,0 +1,105 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team and Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Adapted from PyTorch Vision (https://github.com/pytorch/vision/blob/master/references/detection/group_by_aspect_ratio.py)
+"""
+import bisect
+import copy
+from collections import defaultdict
+import numpy as np
+
+from torch.utils.data.sampler import BatchSampler, Sampler
+
+from utils import logger
+
+def _quantize(x, bins):
+    bins = copy.deepcopy(bins)
+    bins = sorted(bins)
+    quantized = list(map(lambda y: bisect.bisect_right(bins, y), x))
+    return quantized
+
+def create_lengths_groups(lengths, k=0):
+    bins = np.arange(start=3, stop=k, step=4).tolist() if k > 0 else [10]
+    groups = _quantize(lengths, bins)
+    # count number of elements per group
+    counts = np.unique(groups, return_counts=True)[1]
+    fbins = [0] + bins + [np.inf]
+    logger.info("Using {} as bins for aspect lengths quantization".format(fbins))
+    logger.info("Count of instances per bin: {}".format(counts))
+    return groups
+
+class GroupedBatchSampler(BatchSampler):
+    """
+    Wraps another sampler to yield a mini-batch of indices.
+    It enforces that the batch only contains elements from the same group.
+    It also tries to provide mini-batches which follow an ordering which is
+    as close as possible to the ordering from the original sampler.
+    Arguments:
+        sampler (Sampler): Base sampler.
+        group_ids (list[int]): If the sampler produces indices in range [0, N),
+            `group_ids` must be a list of `N` ints which contains the group id of each sample.
+            The group ids must be a continuous set of integers starting from
+            0, i.e. they must be in the range [0, num_groups).
+        batch_size (int): Size of mini-batch.
+    """
+    def __init__(self, sampler, group_ids, batch_size):
+        if not isinstance(sampler, Sampler):
+            raise ValueError(
+                "sampler should be an instance of "
+                "torch.utils.data.Sampler, but got sampler={}".format(sampler)
+            )
+        self.sampler = sampler
+        self.group_ids = group_ids
+        self.batch_size = batch_size
+
+    def __iter__(self):
+        buffer_per_group = defaultdict(list)
+        samples_per_group = defaultdict(list)
+
+        num_batches = 0
+        for idx in self.sampler:
+            group_id = self.group_ids[idx]
+            buffer_per_group[group_id].append(idx)
+            samples_per_group[group_id].append(idx)
+            if len(buffer_per_group[group_id]) == self.batch_size:
+                yield buffer_per_group[group_id] #TODO
+                num_batches += 1
+                del buffer_per_group[group_id]
+            assert len(buffer_per_group[group_id]) < self.batch_size
+
+        # now we have run out of elements that satisfy
+        # the group criteria, let's return the remaining
+        # elements so that the size of the sampler is
+        # deterministic
+        expected_num_batches = len(self)
+        num_remaining = expected_num_batches - num_batches
+        if num_remaining > 0:
+            # for the remaining batches, group the batches by similar lengths
+            batch_idx = []
+            for group_id, idxs in sorted(buffer_per_group.items(), key=lambda x: x[0]):
+                batch_idx.extend(idxs)
+                if len(batch_idx) >= self.batch_size:
+                    yield batch_idx[:self.batch_size]
+                    batch_idx = batch_idx[self.batch_size:]
+                    num_remaining -= 1
+            if len(batch_idx) > 0:
+                yield batch_idx
+                num_remaining -= 1
+        assert num_remaining == 0
+
+    def __len__(self):
+        """
+        Return the number of mini-batches rather than the number of samples.
+        """
+        return (len(self.sampler) + self.batch_size - 1) // self.batch_size

From cbfcfce205d754f2019b6a795d6c7939ddbf58ba Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 11:01:20 -0400
Subject: [PATCH 18/55] update token_counts

---
 examples/distillation/scripts/token_counts.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/distillation/scripts/token_counts.py b/examples/distillation/scripts/token_counts.py
index a484a6f51b..d9de17da4e 100644
--- a/examples/distillation/scripts/token_counts.py
+++ b/examples/distillation/scripts/token_counts.py
@@ -13,7 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Preprocessing script before training DistilBERT.
+Preprocessing script before training the distilled model.
 """
 from collections import Counter
 import argparse

From 23edebc0797008f0525fd1eef7f1299b513457ad Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 11:01:33 -0400
Subject: [PATCH 19/55] update extract_distilbert

---
 ...ct_for_distil.py => extract_distilbert.py} | 28 +++++++------------
 1 file changed, 10 insertions(+), 18 deletions(-)
 rename examples/distillation/scripts/{extract_for_distil.py => extract_distilbert.py} (76%)

diff --git a/examples/distillation/scripts/extract_for_distil.py b/examples/distillation/scripts/extract_distilbert.py
similarity index 76%
rename from examples/distillation/scripts/extract_for_distil.py
rename to examples/distillation/scripts/extract_distilbert.py
index 2e7e5c73d8..fdb0662ca7 100644
--- a/examples/distillation/scripts/extract_for_distil.py
+++ b/examples/distillation/scripts/extract_distilbert.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 """
 Preprocessing script before training DistilBERT.
+Specific to BERT -> DistilBERT.
 """
 from transformers import BertForMaskedLM, RobertaForMaskedLM
 import torch
@@ -21,7 +22,7 @@ import argparse
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(description="Extraction some layers of the full BertForMaskedLM or RObertaForMaskedLM for Transfer Learned Distillation")
-    parser.add_argument("--model_type", default="bert", choices=["bert", "roberta"])
+    parser.add_argument("--model_type", default="bert", choices=["bert"])
     parser.add_argument("--model_name", default='bert-base-uncased', type=str)
     parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_bert-base-uncased_0247911.pth', type=str)
     parser.add_argument("--vocab_transform", action='store_true')
@@ -31,9 +32,8 @@ if __name__ == '__main__':
     if args.model_type == 'bert':
         model = BertForMaskedLM.from_pretrained(args.model_name)
         prefix = 'bert'
-    elif args.model_type == 'roberta':
-        model = RobertaForMaskedLM.from_pretrained(args.model_name)
-        prefix = 'roberta'
+    else:
+        raise ValueError(f'args.model_type should be "bert".')
 
     state_dict = model.state_dict()
     compressed_sd = {}
@@ -68,20 +68,12 @@ if __name__ == '__main__':
                 state_dict[f'{prefix}.encoder.layer.{teacher_idx}.output.LayerNorm.{w}']
         std_idx += 1
 
-    if args.model_type == 'bert':
-        compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
-        compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
-        if args.vocab_transform:
-            for w in ['weight', 'bias']:
-                compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
-                compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
-    elif args.model_type == 'roberta':
-        compressed_sd[f'vocab_projector.weight'] = state_dict[f'lm_head.decoder.weight']
-        compressed_sd[f'vocab_projector.bias'] = state_dict[f'lm_head.bias']
-        if args.vocab_transform:
-            for w in ['weight', 'bias']:
-                compressed_sd[f'vocab_transform.{w}'] = state_dict[f'lm_head.dense.{w}']
-                compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
+    compressed_sd[f'vocab_projector.weight'] = state_dict[f'cls.predictions.decoder.weight']
+    compressed_sd[f'vocab_projector.bias'] = state_dict[f'cls.predictions.bias']
+    if args.vocab_transform:
+        for w in ['weight', 'bias']:
+            compressed_sd[f'vocab_transform.{w}'] = state_dict[f'cls.predictions.transform.dense.{w}']
+            compressed_sd[f'vocab_layer_norm.{w}'] = state_dict[f'cls.predictions.transform.LayerNorm.{w}']
 
     print(f'N layers selected for distillation: {std_idx}')
     print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')

From 4d6dfbd3762ac57e44d97b1c6d0243cfffd1880b Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 11:01:41 -0400
Subject: [PATCH 20/55] update extract

---
 examples/distillation/scripts/extract.py | 89 ++++++++++++++++++++++++
 1 file changed, 89 insertions(+)
 create mode 100644 examples/distillation/scripts/extract.py

diff --git a/examples/distillation/scripts/extract.py b/examples/distillation/scripts/extract.py
new file mode 100644
index 0000000000..5ae1607f3f
--- /dev/null
+++ b/examples/distillation/scripts/extract.py
@@ -0,0 +1,89 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Preprocessing script before training the distilled model.
+Specific to RoBERTa -> DistilRoBERTa and GPT2 -> DistilGPT2.
+"""
+from transformers import BertForMaskedLM, RobertaForMaskedLM, GPT2LMHeadModel
+import torch
+import argparse
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description="Extraction some layers of the full RobertaForMaskedLM or GPT2LMHeadModel for Transfer Learned Distillation")
+    parser.add_argument("--model_type", default="roberta", choices=["roberta", "gpt2"])
+    parser.add_argument("--model_name", default='roberta-large', type=str)
+    parser.add_argument("--dump_checkpoint", default='serialization_dir/tf_roberta_048131723.pth', type=str)
+    parser.add_argument("--vocab_transform", action='store_true')
+    args = parser.parse_args()
+
+
+    if args.model_type == 'roberta':
+        model = RobertaForMaskedLM.from_pretrained(args.model_name)
+        prefix = 'roberta'
+    elif args.model_type == 'gpt2':
+        model = GPT2LMHeadModel.from_pretrained(args.model_name)
+        prefix = 'transformer'
+
+    state_dict = model.state_dict()
+    compressed_sd = {}
+
+    ### Embeddings ###
+    if args.model_type == 'gpt2':
+        for param_name in ['wte.weight', 'wpe.weight']:
+            compressed_sd[f'{prefix}.{param_name}'] = state_dict[f'{prefix}.{param_name}']
+    else:
+        for w in ['word_embeddings', 'position_embeddings', 'token_type_embeddings']:
+            param_name = f'{prefix}.embeddings.{w}.weight'
+            compressed_sd[param_name] = state_dict[param_name]
+        for w in ['weight', 'bias']:
+            param_name = f'{prefix}.embeddings.LayerNorm.{w}'
+            compressed_sd[param_name] = state_dict[param_name]
+
+    ### Transformer Blocks ###
+    std_idx = 0
+    for teacher_idx in [0, 2, 4, 7, 9, 11]:
+        if args.model_type == 'gpt2':
+            for layer in ['ln_1', 'attn.c_attn', 'attn.c_proj', 'ln_2', 'mlp.c_fc', 'mlp.c_proj']:
+                for w in ['weight', 'bias']:
+                    compressed_sd[f'{prefix}.h.{std_idx}.{layer}.{w}'] = \
+                        state_dict[f'{prefix}.h.{teacher_idx}.{layer}.{w}']
+            compressed_sd[f'{prefix}.h.{std_idx}.attn.bias'] = state_dict[f'{prefix}.h.{teacher_idx}.attn.bias']
+        else:
+            for layer in ['attention.self.query', 'attention.self.key', 'attention.self.value',
+                        'attention.output.dense', 'attention.output.LayerNorm',
+                        'intermediate.dense', 'output.dense', 'output.LayerNorm']:
+                for w in ['weight', 'bias']:
+                    compressed_sd[f'{prefix}.encoder.layer.{std_idx}.{layer}.{w}'] = \
+                        state_dict[f'{prefix}.encoder.layer.{teacher_idx}.{layer}.{w}']
+        std_idx += 1
+
+    ### Language Modeling Head ###
+    if args.model_type == 'roberta':
+        for layer in ['lm_head.decoder.weight', 'lm_head.bias']:
+            compressed_sd[f'{layer}'] = state_dict[f'{layer}']
+        if args.vocab_transform:
+            for w in ['weight', 'bias']:
+                compressed_sd[f'lm_head.dense.{w}'] = state_dict[f'lm_head.dense.{w}']
+                compressed_sd[f'lm_head.layer_norm.{w}'] = state_dict[f'lm_head.layer_norm.{w}']
+    elif args.model_type == 'gpt2':
+        for w in ['weight', 'bias']:
+            compressed_sd[f'{prefix}.ln_f.{w}'] = state_dict[f'{prefix}.ln_f.{w}']
+        compressed_sd[f'lm_head.weight'] = state_dict[f'lm_head.weight']
+
+    print(f'N layers selected for distillation: {std_idx}')
+    print(f'Number of params transfered for distillation: {len(compressed_sd.keys())}')
+
+    print(f'Save transfered checkpoint to {args.dump_checkpoint}.')
+    torch.save(compressed_sd, args.dump_checkpoint)

From a12ab0a8dba640730b5353d07ca8893c2d64688f Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 11:01:55 -0400
Subject: [PATCH 21/55] update binarized_data

---
 examples/distillation/scripts/binarized_data.py | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py
index eb4af08b0f..43824e9964 100644
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -13,14 +13,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Preprocessing script before training DistilBERT.
+Preprocessing script before distillation.
 """
 import argparse
 import pickle
 import random
 import time
 import numpy as np
-from transformers import BertTokenizer, RobertaTokenizer
+from transformers import BertTokenizer, RobertaTokenizer, GPT2Tokenizer
 import logging
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
@@ -32,7 +32,7 @@ def main():
     parser = argparse.ArgumentParser(description="Preprocess the data to avoid re-doing it several times by (tokenization + token_to_ids).")
     parser.add_argument('--file_path', type=str, default='data/dump.txt',
                         help='The path to the data.')
-    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta'])
+    parser.add_argument('--tokenizer_type', type=str, default='bert', choices=['bert', 'roberta', 'gpt2'])
     parser.add_argument('--tokenizer_name', type=str, default='bert-base-uncased',
                         help="The tokenizer to use.")
     parser.add_argument('--dump_file', type=str, default='data/dump',
@@ -43,10 +43,16 @@ def main():
     logger.info(f'Loading Tokenizer ({args.tokenizer_name})')
     if args.tokenizer_type == 'bert':
         tokenizer = BertTokenizer.from_pretrained(args.tokenizer_name)
+        bos = tokenizer.special_tokens_map['cls_token'] # `[CLS]`
+        sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]`
     elif args.tokenizer_type == 'roberta':
         tokenizer = RobertaTokenizer.from_pretrained(args.tokenizer_name)
-    bos = tokenizer.special_tokens_map['bos_token'] # `[CLS]` for bert, `<s>` for roberta
-    sep = tokenizer.special_tokens_map['sep_token'] # `[SEP]` for bert, `</s>` for roberta
+        bos = tokenizer.special_tokens_map['cls_token'] # `<s>`
+        sep = tokenizer.special_tokens_map['sep_token'] # `</s>`
+    elif args.tokenizer_type == 'gpt2':
+        tokenizer = GPT2Tokenizer.from_pretrained(args.tokenizer_name)
+        bos = tokenizer.special_tokens_map['bos_token'] # `<|endoftext|>`
+        sep = tokenizer.special_tokens_map['eos_token'] # `<|endoftext|>`    
 
     logger.info(f'Loading text from {args.file_path}')
     with open(args.file_path, 'r', encoding='utf8') as fp:

From bb9c5ead5444d7510e4734f540bf87fa9c5669fb Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 11:02:30 -0400
Subject: [PATCH 22/55] update distiller

---
 examples/distillation/distiller.py | 187 ++++++++++++++++++-----------
 1 file changed, 117 insertions(+), 70 deletions(-)

diff --git a/examples/distillation/distiller.py b/examples/distillation/distiller.py
index 79755b81e0..f736936449 100644
--- a/examples/distillation/distiller.py
+++ b/examples/distillation/distiller.py
@@ -12,8 +12,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" The distiller to distil DistilBERT
-    adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
+""" The distiller to distil the student.
+    Adapted in part from Facebook, Inc XLM model (https://github.com/facebookresearch/XLM)
 """
 import os
 import math
@@ -28,16 +28,19 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from torch.optim import AdamW
+from torch.utils.data.distributed import DistributedSampler
+from torch.utils.data import RandomSampler, BatchSampler, DataLoader
 
 from transformers import WarmupLinearSchedule
 
 from utils import logger
-from dataset import Dataset
+from lm_seqs_dataset import LmSeqsDataset
+from grouped_batch_sampler import GroupedBatchSampler, create_lengths_groups
 
 class Distiller:
     def __init__(self,
                  params: dict,
-                 dataloader: Dataset,
+                 dataset: LmSeqsDataset,
                  token_probs: torch.tensor,
                  student: nn.Module,
                  teacher: nn.Module):
@@ -50,33 +53,47 @@ class Distiller:
         self.student = student
         self.teacher = teacher
 
-        self.dataloader = dataloader
-        if self.params.n_gpu > 1:
-            self.dataloader.split()
-        self.get_iterator(seed=params.seed)
+        self.student_config = student.config
+        self.vocab_size = student.config.vocab_size
+
+        if params.n_gpu <= 1:
+            sampler = RandomSampler(dataset)
+        else:
+            sampler = DistributedSampler(dataset)
+
+        if params.group_by_size:
+            groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size)
+            sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size)
+        else:
+            sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)
+
+        self.dataloader = DataLoader(dataset=dataset,
+                                     batch_sampler=sampler,
+                                     collate_fn=dataset.batch_sequences)
 
         self.temperature = params.temperature
         assert self.temperature > 0.
 
         self.alpha_ce = params.alpha_ce
         self.alpha_mlm = params.alpha_mlm
+        self.alpha_clm = params.alpha_clm
         self.alpha_mse = params.alpha_mse
         self.alpha_cos = params.alpha_cos
-        assert self.alpha_ce >= 0.
-        assert self.alpha_mlm >= 0.
-        assert self.alpha_mse >= 0.
-        assert self.alpha_cos >= 0.
-        assert self.alpha_ce + self.alpha_mlm + self.alpha_mse + self.alpha_cos > 0.
 
-        self.mlm_mask_prop = params.mlm_mask_prop
-        assert 0.0 <= self.mlm_mask_prop <= 1.0
-        assert params.word_mask + params.word_keep + params.word_rand == 1.0
-        self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
-        self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
-        self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
-        if self.fp16:
-            self.pred_probs = self.pred_probs.half()
-            self.token_probs = self.token_probs.half()
+        self.mlm = params.mlm
+        if self.mlm:
+            logger.info(f'Using MLM loss for LM step.')
+            self.mlm_mask_prop = params.mlm_mask_prop
+            assert 0.0 <= self.mlm_mask_prop <= 1.0
+            assert params.word_mask + params.word_keep + params.word_rand == 1.0
+            self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
+            self.pred_probs = self.pred_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else self.pred_probs
+            self.token_probs = token_probs.to(f'cuda:{params.local_rank}') if params.n_gpu > 0 else token_probs
+            if self.fp16:
+                self.pred_probs = self.pred_probs.half()
+                self.token_probs = self.token_probs.half()
+        else:
+            logger.info(f'Using CLM loss for LM step.')
 
         self.epoch = 0
         self.n_iter = 0
@@ -86,12 +103,13 @@ class Distiller:
         self.last_loss = 0
         self.last_loss_ce = 0
         self.last_loss_mlm = 0
+        self.last_loss_clm = 0
         if self.alpha_mse > 0.: self.last_loss_mse = 0
         if self.alpha_cos > 0.: self.last_loss_cos = 0
         self.last_log = 0
 
         self.ce_loss_fct = nn.KLDivLoss(reduction='batchmean')
-        self.mlm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
+        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
         if self.alpha_mse > 0.:
             self.mse_loss_fct = nn.MSELoss(reduction='sum')
         if self.alpha_cos > 0.:
@@ -99,7 +117,7 @@ class Distiller:
 
         logger.info('--- Initializing model optimizer')
         assert params.gradient_accumulation_steps >= 1
-        self.num_steps_epoch = int(len(self.dataloader) / params.batch_size) + 1
+        self.num_steps_epoch = len(self.dataloader)
         num_train_optimization_steps = int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
 
         no_decay = ['bias', 'LayerNorm.weight']
@@ -140,43 +158,18 @@ class Distiller:
                 logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
                 self.student = DistributedDataParallel(self.student,
                                                        device_ids=[params.local_rank],
-                                                       output_device=params.local_rank)
+                                                       output_device=params.local_rank,
+                                                       find_unused_parameters=True)
 
         self.is_master = params.is_master
         if self.is_master:
             logger.info('--- Initializing Tensorboard')
             self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, 'log', 'train'))
-            self.tensorboard.add_text(tag='config', text_string=str(self.params), global_step=0)
+            self.tensorboard.add_text(tag='config/training', text_string=str(self.params), global_step=0)
+            self.tensorboard.add_text(tag='config/student', text_string=str(self.student_config), global_step=0)
 
-    def get_iterator(self,
-                     seed: int = None):
-        """
-        Initialize the data iterator.
-        Each process has its own data iterator (iterating on his own random portion of the dataset).
-
-        Input:
-        ------
-            seed: `int` - The random seed.
-        """
-        logger.info('--- Initializing Data Iterator')
-        self.data_iterator = self.dataloader.get_iterator(seed=seed)
-
-    def get_batch(self):
-        """
-        Call the data iterator to output a new batch.
-        If the data iterator went through the whole dataset, create a new iterator.
-        """
-        assert hasattr(self, 'data_iterator')
-        try:
-            x = next(self.data_iterator)
-        except StopIteration:
-            logger.warning('--- Went through the whole dataset. Creating new data iterator.')
-            self.data_iterator = self.dataloader.get_iterator()
-            x = next(self.data_iterator)
-        return x
-
-    def prepare_batch(self,
-                      batch):
+    def prepare_batch_mlm(self,
+                          batch):
         """
         Prepare the batch: from the token_ids and the lenghts, compute the attention mask and the masked label for MLM.
 
@@ -222,7 +215,7 @@ class Distiller:
                 assert pred_mask.sum().item() % 8 == 0, pred_mask.sum().item()
 
         _token_ids_real = token_ids[pred_mask]
-        _token_ids_rand = _token_ids_real.clone().random_(self.params.vocab_size)
+        _token_ids_rand = _token_ids_real.clone().random_(self.vocab_size)
         _token_ids_mask = _token_ids_real.clone().fill_(self.params.special_tok_ids['mask_token'])
         probs = torch.multinomial(self.pred_probs, len(_token_ids_real), replacement=True)
         _token_ids = _token_ids_mask * (probs == 0).long() + _token_ids_real * (probs == 1).long() + _token_ids_rand * (probs == 2).long()
@@ -230,8 +223,41 @@ class Distiller:
 
         mlm_labels[~pred_mask] = -1 # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
 
+        # sanity checks
+        assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
+
         return token_ids, attn_mask, mlm_labels
 
+    def prepare_batch_clm(self,
+                          batch):
+        """
+        Prepare the batch: from the token_ids and the lengths, compute the attention mask and the labels for CLM.
+
+        Input:
+        ------
+            batch: `Tuple`
+                token_ids: `torch.tensor(bs, seq_length)` - The token ids for each of the sequence. It is padded.
+                lengths: `torch.tensor(bs)` - The lengths of each of the sequences in the batch.
+
+        Output:
+        -------
+            token_ids: `torch.tensor(bs, seq_length)` - The token ids as returned by `round_batch` (no MLM-style modification is applied for CLM).
+            attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
+            clm_labels: `torch.tensor(bs, seq_length)` - The causal language modeling labels. There is a -1 where there is nothing to predict.
+        """
+        token_ids, lengths = batch
+        token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
+        assert token_ids.size(0) == lengths.size(0)
+
+        attn_mask = (torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None])
+        clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
+        clm_labels[~attn_mask] = -1 # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
+
+        # sanity checks
+        assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
+
+        return token_ids, attn_mask, clm_labels
+
     def round_batch(self,
                     x: torch.tensor,
                     lengths: torch.tensor):
@@ -269,7 +295,10 @@ class Distiller:
         if ml1 % 8 != 0:
             pad = 8 - (ml1 % 8)
             ml2 = ml1 + pad
-            pad_id = self.params.special_tok_ids['pad_token']
+            if self.mlm:
+                pad_id = self.params.special_tok_ids['pad_token']
+            else:
+                pad_id = self.params.special_tok_ids['unk_token']
             padding_tensor = torch.zeros(bs2, pad, dtype=torch.long, device=x.device).fill_(pad_id)
             x = torch.cat([x, padding_tensor], 1)
             assert x.size() == (bs2, ml2)
@@ -292,14 +321,16 @@ class Distiller:
             if self.multi_gpu:
                 torch.distributed.barrier()
 
-            iter_bar = trange(self.num_steps_epoch, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
-            for __ in range(self.num_steps_epoch):
-                batch = self.get_batch()
+            iter_bar = tqdm(self.dataloader, desc="-Iter", disable=self.params.local_rank not in [-1, 0])
+            for batch in iter_bar:
                 if self.params.n_gpu > 0:
                     batch = tuple(t.to(f'cuda:{self.params.local_rank}') for t in batch)
-                token_ids, attn_mask, mlm_labels = self.prepare_batch(batch=batch)
 
-                self.step(input_ids=token_ids, attention_mask=attn_mask, mlm_labels=mlm_labels)
+                if self.mlm:
+                    token_ids, attn_mask, lm_labels = self.prepare_batch_mlm(batch=batch)
+                else:
+                    token_ids, attn_mask, lm_labels = self.prepare_batch_clm(batch=batch)
+                self.step(input_ids=token_ids, attention_mask=attn_mask, lm_labels=lm_labels)
 
                 iter_bar.update()
                 iter_bar.set_postfix({'Last_loss': f'{self.last_loss:.2f}',
@@ -317,7 +348,7 @@ class Distiller:
     def step(self,
              input_ids: torch.tensor,
              attention_mask: torch.tensor,
-             mlm_labels: torch.tensor):
+             lm_labels: torch.tensor):
         """
         One optimization step: forward of student AND teacher, backward on the loss (for gradient accumulation),
         and possibly a parameter update (depending on the gradient accumulation).
@@ -326,17 +357,22 @@ class Distiller:
         ------
         input_ids: `torch.tensor(bs, seq_length)` - The token ids.
         attention_mask: `torch.tensor(bs, seq_length)` - The attention mask for self attention.
-        mlm_labels: `torch.tensor(bs, seq_length)` - The masked language modeling labels.
+        lm_labels: `torch.tensor(bs, seq_length)` - The language modeling labels (mlm labels for MLM and clm labels for CLM).
         """
-        s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask)     # (bs, seq_length, voc_size)
-        with torch.no_grad():
-            t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
+        if self.mlm:
+            s_logits, s_hidden_states = self.student(input_ids=input_ids, attention_mask=attention_mask)     # (bs, seq_length, voc_size)
+            with torch.no_grad():
+                t_logits, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=attention_mask) # (bs, seq_length, voc_size)
+        else:
+            s_logits, _, s_hidden_states = self.student(input_ids=input_ids, attention_mask=None)            # (bs, seq_length, voc_size)
+            with torch.no_grad():
+                t_logits, _, t_hidden_states = self.teacher(input_ids=input_ids, attention_mask=None)           # (bs, seq_length, voc_size)
         assert s_logits.size() == t_logits.size()
 
         #https://github.com/peterliht/knowledge-distillation-pytorch/blob/master/model/net.py#L100
         #https://github.com/peterliht/knowledge-distillation-pytorch/issues/2
         if self.params.restrict_ce_to_mask:
-            mask = (mlm_labels>-1).unsqueeze(-1).expand_as(s_logits)   # (bs, seq_lenth, voc_size)
+            mask = (lm_labels>-1).unsqueeze(-1).expand_as(s_logits)    # (bs, seq_lenth, voc_size)
         else:
             mask = attention_mask.unsqueeze(-1).expand_as(s_logits)    # (bs, seq_lenth, voc_size)
         s_logits_slct = torch.masked_select(s_logits, mask)            # (bs * seq_length * voc_size) modulo the 1s in mask
@@ -348,13 +384,20 @@ class Distiller:
         loss_ce = self.ce_loss_fct(F.log_softmax(s_logits_slct/self.temperature, dim=-1),
                                    F.softmax(t_logits_slct/self.temperature, dim=-1)) * (self.temperature)**2
         loss = self.alpha_ce*loss_ce
+
         if self.alpha_mlm > 0.:
-            loss_mlm = self.mlm_loss_fct(s_logits.view(-1, s_logits.size(-1)), mlm_labels.view(-1))
+            loss_mlm = self.lm_loss_fct(s_logits.view(-1, s_logits.size(-1)), lm_labels.view(-1))
             loss += self.alpha_mlm * loss_mlm
+        if self.alpha_clm > 0.:
+            shift_logits = s_logits[..., :-1, :].contiguous()
+            shift_labels = lm_labels[..., 1:].contiguous()
+            loss_clm = self.lm_loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                                        shift_labels.view(-1))
+            loss += self.alpha_clm * loss_clm
+
         if self.alpha_mse > 0.:
             loss_mse = self.mse_loss_fct(s_logits_slct, t_logits_slct)/s_logits_slct.size(0) # Reproducing batchmean reduction
             loss += self.alpha_mse * loss_mse
-        
         if self.alpha_cos > 0.:
             s_hidden_states = s_hidden_states[-1]                              # (bs, seq_length, dim)
             t_hidden_states = t_hidden_states[-1]                              # (bs, seq_length, dim)
@@ -376,6 +419,8 @@ class Distiller:
         self.last_loss_ce = loss_ce.item()
         if self.alpha_mlm > 0.:
             self.last_loss_mlm = loss_mlm.item()
+        if self.alpha_clm > 0.:
+            self.last_loss_clm = loss_clm.item()
         if self.alpha_mse > 0.:
             self.last_loss_mse = loss_mse.item()
         if self.alpha_cos > 0.:
@@ -452,6 +497,8 @@ class Distiller:
         self.tensorboard.add_scalar(tag="losses/loss_ce", scalar_value=self.last_loss_ce, global_step=self.n_total_iter)
         if self.alpha_mlm > 0.:
             self.tensorboard.add_scalar(tag="losses/loss_mlm", scalar_value=self.last_loss_mlm, global_step=self.n_total_iter)
+        if self.alpha_clm > 0.:
+            self.tensorboard.add_scalar(tag="losses/loss_clm", scalar_value=self.last_loss_clm, global_step=self.n_total_iter)
         if self.alpha_mse > 0.:
             self.tensorboard.add_scalar(tag="losses/loss_mse", scalar_value=self.last_loss_mse, global_step=self.n_total_iter)
         if self.alpha_cos > 0.:

From a76c3f9cb0fafc87dbed6f5a6b2b0bf3e3a00c03 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 11:02:43 -0400
Subject: [PATCH 23/55] update requirements

---
 examples/distillation/requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/distillation/requirements.txt b/examples/distillation/requirements.txt
index 2cf6ee2d81..d76273b34a 100644
--- a/examples/distillation/requirements.txt
+++ b/examples/distillation/requirements.txt
@@ -3,4 +3,4 @@ tensorboard>=1.14.0
 tensorboardX==1.8
 psutil==5.6.3
 scipy==1.3.1
-pytorch_transformers==1.2.0
+transformers==2.0.0

From c51e533a5febe3fae2bb33b060f6b1f36a92e003 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 11:02:53 -0400
Subject: [PATCH 24/55] update train.py

---
 examples/distillation/train.py | 193 ++++++++++++++++++++-------------
 1 file changed, 118 insertions(+), 75 deletions(-)

diff --git a/examples/distillation/train.py b/examples/distillation/train.py
index f0255d08fe..311f0580ff 100644
--- a/examples/distillation/train.py
+++ b/examples/distillation/train.py
@@ -13,7 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-Training DistilBERT.
+Training the distilled model.
+Supported architectures include: BERT -> DistilBERT, RoBERTa -> DistilRoBERTa, GPT2 -> DistilGPT2.
 """
 import os
 import argparse
@@ -23,68 +24,96 @@ import shutil
 import numpy as np
 import torch
 
-from transformers import BertTokenizer, BertForMaskedLM, RobertaTokenizer, RobertaForMaskedLM
-from transformers import DistilBertForMaskedLM, DistilBertConfig
+from transformers import BertConfig, BertForMaskedLM, BertTokenizer
+from transformers import RobertaConfig, RobertaForMaskedLM, RobertaTokenizer
+from transformers import DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer
+from transformers import GPT2Config, GPT2LMHeadModel, GPT2Tokenizer
 
 from distiller import Distiller
 from utils import git_log, logger, init_gpu_params, set_seed
-from dataset import Dataset
+from lm_seqs_dataset import LmSeqsDataset
 
 
+MODEL_CLASSES = {
+    'distilbert': (DistilBertConfig, DistilBertForMaskedLM, DistilBertTokenizer),
+    'roberta': (RobertaConfig, RobertaForMaskedLM, RobertaTokenizer),
+    'bert': (BertConfig, BertForMaskedLM, BertTokenizer),
+    'gpt2': (GPT2Config, GPT2LMHeadModel, GPT2Tokenizer)
+}
+
+def sanity_checks(args):
+    """
+    A bunch of args sanity checks to perform before even starting...
+    """
+    assert (args.mlm and args.alpha_mlm > 0.) or (not args.mlm and args.alpha_mlm == 0.)
+    assert (args.alpha_mlm > 0. and args.alpha_clm == 0.) or (args.alpha_mlm == 0. and args.alpha_clm > 0.)
+    if args.mlm:
+        assert os.path.isfile(args.token_counts)
+        assert (args.student_type in ['roberta', 'distilbert']) and (args.teacher_type in ['roberta', 'bert'])
+    else:
+        assert (args.student_type in ['gpt2']) and (args.teacher_type in ['gpt2'])
+
+    assert args.teacher_type == args.student_type or (args.student_type=='distilbert' and args.teacher_type=='bert')
+    assert os.path.isfile(args.student_config)
+    if args.student_pretrained_weights is not None:
+        assert os.path.isfile(args.student_pretrained_weights)
+
+    if args.freeze_token_type_embds: assert args.student_type in ['roberta']
+
+    assert args.alpha_ce >= 0.
+    assert args.alpha_mlm >= 0.
+    assert args.alpha_clm >= 0.
+    assert args.alpha_mse >= 0.
+    assert args.alpha_cos >= 0.
+    assert args.alpha_ce + args.alpha_mlm + args.alpha_clm + args.alpha_mse + args.alpha_cos > 0.
+
+def freeze_pos_embeddings(student, args):
+    if args.student_type == 'roberta':
+        student.roberta.embeddings.position_embeddings.weight.requires_grad = False
+    elif args.student_type == 'gpt2':
+        student.transformer.wpe.weight.requires_grad = False
+
+def freeze_token_type_embeddings(student, args):
+    if args.student_type == 'roberta':
+        student.roberta.embeddings.token_type_embeddings.weight.requires_grad = False
+
 def main():
     parser = argparse.ArgumentParser(description="Training")
+    parser.add_argument("--force", action='store_true',
+                        help="Overwrite dump_path if it already exists.")
 
     parser.add_argument("--dump_path", type=str, required=True,
                         help="The output directory (log, checkpoints, parameters, etc.)")
     parser.add_argument("--data_file", type=str, required=True,
                         help="The binarized file (tokenized + tokens_to_ids) and grouped by sequence.")
-    parser.add_argument("--token_counts", type=str, required=True,
-                        help="The token counts in the data_file for MLM.")
-    parser.add_argument("--force", action='store_true',
-                        help="Overwrite dump_path if it already exists.")
 
-    parser.add_argument("--vocab_size", default=30522, type=int,
-                        help="The vocabulary size.")
-    parser.add_argument("--max_position_embeddings", default=512, type=int,
-                        help="Maximum sequence length we can model (including [CLS] and [SEP]).")
-    parser.add_argument("--sinusoidal_pos_embds", action='store_false',
-                        help="If true, the position embeddings are simply fixed with sinusoidal embeddings.")
-    parser.add_argument("--n_layers", default=6, type=int,
-                        help="Number of Transformer blocks.")
-    parser.add_argument("--n_heads", default=12, type=int,
-                        help="Number of heads in the self-attention module.")
-    parser.add_argument("--dim", default=768, type=int,
-                        help="Dimension through the network. Must be divisible by n_heads")
-    parser.add_argument("--hidden_dim", default=3072, type=int,
-                        help="Intermediate dimension in the FFN.")
-    parser.add_argument("--dropout", default=0.1, type=float,
-                        help="Dropout.")
-    parser.add_argument("--attention_dropout", default=0.1, type=float,
-                        help="Dropout in self-attention.")
-    parser.add_argument("--activation", default='gelu', type=str,
-                        help="Activation to use in self-attention")
-    parser.add_argument("--tie_weights_", action='store_false',
-                        help="If true, we tie the embeddings matrix with the projection over the vocabulary matrix. Default is true.")
-
-    parser.add_argument("--from_pretrained_weights", default=None, type=str,
+    parser.add_argument("--student_type", type=str, choices=["distilbert", "roberta", "gpt2"], required=True,
+                        help="The student type (DistilBERT, RoBERTa).")
+    parser.add_argument("--student_config", type=str, required=True,
+                        help="Path to the student configuration.")
+    parser.add_argument("--student_pretrained_weights", default=None, type=str,
                         help="Load student initialization checkpoint.")
-    parser.add_argument("--from_pretrained_config", default=None, type=str,
-                        help="Load student initialization architecture config.")
-    parser.add_argument("--teacher_type", default="bert", choices=["bert", "roberta"],
+
+    parser.add_argument("--teacher_type", choices=["bert", "roberta", "gpt2"], required=True,
                         help="Teacher type (BERT, RoBERTa).")
-    parser.add_argument("--teacher_name", default="bert-base-uncased", type=str,
+    parser.add_argument("--teacher_name", type=str, required=True,
                         help="The teacher model.")
 
     parser.add_argument("--temperature", default=2., type=float,
                         help="Temperature for the softmax temperature.")
     parser.add_argument("--alpha_ce", default=0.5, type=float,
                         help="Linear weight for the distillation loss. Must be >=0.")
-    parser.add_argument("--alpha_mlm", default=0.5, type=float,
-                        help="Linear weight for the MLM loss. Must be >=0.")
+    parser.add_argument("--alpha_mlm", default=0.0, type=float,
+                        help="Linear weight for the MLM loss. Must be >=0. Should be used in coonjunction with `mlm` flag.")
+    parser.add_argument("--alpha_clm", default=0.5, type=float,
+                        help="Linear weight for the CLM loss. Must be >=0.")
     parser.add_argument("--alpha_mse", default=0.0, type=float,
                         help="Linear weight of the MSE loss. Must be >=0.")
     parser.add_argument("--alpha_cos", default=0.0, type=float,
                         help="Linear weight of the cosine embedding loss. Must be >=0.")
+
+    parser.add_argument("--mlm", action="store_true",
+                        help="The LM step: MLM or CLM. If `mlm` is True, the MLM is used over CLM.")
     parser.add_argument("--mlm_mask_prop", default=0.15, type=float,
                         help="Proportion of tokens for which we need to make a prediction.")
     parser.add_argument("--word_mask", default=0.8, type=float,
@@ -95,17 +124,20 @@ def main():
                         help="Proportion of tokens to randomly replace.")
     parser.add_argument("--mlm_smoothing", default=0.7, type=float,
                         help="Smoothing parameter to emphasize more rare tokens (see XLM, similar to word2vec).")
+    parser.add_argument("--token_counts", type=str,
+                        help="The token counts in the data_file for MLM.")
+
     parser.add_argument("--restrict_ce_to_mask", action='store_true',
                         help="If true, compute the distilation loss only the [MLM] prediction distribution.")
+    parser.add_argument("--freeze_pos_embs", action="store_true",
+                        help="Freeze positional embeddings during distillation. For student_type in ['roberta', 'gpt2'] only.")
+    parser.add_argument("--freeze_token_type_embds", action="store_true",
+                        help="Freeze token type embeddings during distillation if existent. For student_type in ['roberta'] only.")
 
     parser.add_argument("--n_epoch", type=int, default=3,
                         help="Number of pass on the whole dataset.")
     parser.add_argument("--batch_size", type=int, default=5,
                         help="Batch size (for each process).")
-    parser.add_argument("--tokens_per_batch", type=int, default=-1,
-                        help="If specified, modify the batches so that they have approximately this number of tokens.")
-    parser.add_argument("--shuffle", action='store_false',
-                        help="If true, shuffle the sequence order. Default is true.")
     parser.add_argument("--group_by_size", action='store_false',
                         help="If true, group sequences that have similar length into the same batch. Default is true.")
 
@@ -141,6 +173,7 @@ def main():
     parser.add_argument("--checkpoint_interval", type=int, default=4000,
                         help="Checkpoint interval.")
     args = parser.parse_args()
+    sanity_checks(args)
 
 
     ## ARGS ##
@@ -164,21 +197,19 @@ def main():
         with open(os.path.join(args.dump_path, 'parameters.json'), 'w') as f:
             json.dump(vars(args), f, indent=4)
         git_log(args.dump_path)
-    assert (args.from_pretrained_weights is None and args.from_pretrained_config is None) or \
-           (args.from_pretrained_weights is not None and args.from_pretrained_config is not None)
 
+    student_config_class, student_model_class, _ = MODEL_CLASSES[args.student_type]
+    teacher_config_class, teacher_model_class, teacher_tokenizer_class = MODEL_CLASSES[args.teacher_type]
 
     ### TOKENIZER ###
-    if args.teacher_type == 'bert':
-        tokenizer = BertTokenizer.from_pretrained(args.teacher_name)
-    elif args.teacher_type == 'roberta':
-        tokenizer = RobertaTokenizer.from_pretrained(args.teacher_name)
+    tokenizer = teacher_tokenizer_class.from_pretrained(args.teacher_name)
     special_tok_ids = {}
     for tok_name, tok_symbol in tokenizer.special_tokens_map.items():
         idx = tokenizer.all_special_tokens.index(tok_symbol)
         special_tok_ids[tok_name] = tokenizer.all_special_ids[idx]
     logger.info(f'Special tokens {special_tok_ids}')
     args.special_tok_ids = special_tok_ids
+    args.max_model_input_size = tokenizer.max_model_input_sizes[args.teacher_name]
 
 
     ## DATA LOADER ##
@@ -187,35 +218,34 @@ def main():
         data = pickle.load(fp)
 
 
-    assert os.path.isfile(args.token_counts)
-    logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
-    with open(args.token_counts, 'rb') as fp:
-        counts = pickle.load(fp)
-        assert len(counts) == args.vocab_size
-    token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
-    for idx in special_tok_ids.values():
-        token_probs[idx] = 0.  # do not predict special tokens
-    token_probs = torch.from_numpy(token_probs)
+    if args.mlm:
+        logger.info(f'Loading token counts from {args.token_counts} (already pre-computed)')
+        with open(args.token_counts, 'rb') as fp:
+            counts = pickle.load(fp)
+        
+        token_probs = np.maximum(counts, 1) ** -args.mlm_smoothing
+        for idx in special_tok_ids.values():
+            token_probs[idx] = 0.  # do not predict special tokens
+        token_probs = torch.from_numpy(token_probs)
+    else:
+        token_probs = None
 
 
-    train_dataloader = Dataset(params=args, data=data)
+    train_lm_seq_dataset = LmSeqsDataset(params=args, data=data)
     logger.info(f'Data loader created.')
 
 
     ## STUDENT ##
-    if args.from_pretrained_weights is not None:
-        assert os.path.isfile(args.from_pretrained_weights)
-        assert os.path.isfile(args.from_pretrained_config)
-        logger.info(f'Loading pretrained weights from {args.from_pretrained_weights}')
-        logger.info(f'Loading pretrained config from {args.from_pretrained_config}')
-        stu_architecture_config = DistilBertConfig.from_json_file(args.from_pretrained_config)
-        stu_architecture_config.output_hidden_states = True
-        student = DistilBertForMaskedLM.from_pretrained(args.from_pretrained_weights,
-                                                        config=stu_architecture_config)
+    logger.info(f'Loading student config from {args.student_config}')
+    stu_architecture_config = student_config_class.from_pretrained(args.student_config)
+    stu_architecture_config.output_hidden_states = True
+
+    if args.student_pretrained_weights is not None:
+        logger.info(f'Loading pretrained weights from {args.student_pretrained_weights}')
+        student = student_model_class.from_pretrained(args.student_pretrained_weights,
+                                                      config=stu_architecture_config)
     else:
-        args.vocab_size_or_config_json_file = args.vocab_size
-        stu_architecture_config = DistilBertConfig(**vars(args), output_hidden_states=True)
-        student = DistilBertForMaskedLM(stu_architecture_config)
+        student = student_model_class(stu_architecture_config)
 
 
     if args.n_gpu > 0:
@@ -224,18 +254,31 @@ def main():
 
 
     ## TEACHER ##
-    if args.teacher_type == 'bert':
-        teacher = BertForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True)
-    elif args.teacher_type == 'roberta':
-        teacher = RobertaForMaskedLM.from_pretrained(args.teacher_name, output_hidden_states=True)
+    teacher = teacher_model_class.from_pretrained(args.teacher_name, output_hidden_states=True)
     if args.n_gpu > 0:
         teacher.to(f'cuda:{args.local_rank}')
     logger.info(f'Teacher loaded from {args.teacher_name}.')
 
+
+    ## FREEZING ##
+    if args.freeze_pos_embs:
+        freeze_pos_embeddings(student, args)
+    if args.freeze_token_type_embds:
+        freeze_token_type_embeddings(student, args)
+
+
+    ## SANITY CHECKS ##
+    assert student.config.vocab_size == teacher.config.vocab_size
+    assert student.config.hidden_size == teacher.config.hidden_size
+    assert student.config.max_position_embeddings == teacher.config.max_position_embeddings
+    if args.mlm:
+        assert token_probs.size(0) == stu_architecture_config.vocab_size
+
+
     ## DISTILLER ##
     torch.cuda.empty_cache()
     distiller = Distiller(params=args,
-                          dataloader=train_dataloader,
+                          dataset=train_lm_seq_dataset,
                           token_probs=token_probs,
                           student=student,
                           teacher=teacher)

From 2a91f6071ff7ada3fe9fc35fcdfe456c323b7788 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 15:02:42 -0400
Subject: [PATCH 25/55] update README - TODO update link to paper

---
 examples/distillation/README.md | 57 ++++++++++++++++++++++-----------
 1 file changed, 39 insertions(+), 18 deletions(-)

diff --git a/examples/distillation/README.md b/examples/distillation/README.md
index 4cddbd3a2e..ad439cf5f8 100644
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -1,22 +1,25 @@
-# DistilBERT
+# Distil*
 
-This folder contains the original code used to train DistilBERT as well as examples showcasing how to use DistilBERT.
+This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT and DistilGPT2.
+
+**2019, October 3rd - Update:** We release our [NeurIPS workshop paper](TODO LINK) explaining our approach on DistilBERT. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of DistilGPT2. DistilGPT2 is two times faster and 33% smaller than GPT2.
 
 **2019, September 19th - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
 
-## What is DistilBERT
+## What is Distil*
 
-DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
+Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
 
-For more information on DistilBERT, please refer to our [detailed blog post](https://medium.com/huggingface/smaller-faster-cheaper-lighter-introducing-distilbert-a-distilled-version-of-bert-8cf3380435b5
-). *Please note that we will publish a formal write-up with updated and more complete results in the near future (September 19th).*
+We have applied the same method to GPT2 and release the weights of the compressed model. On the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.84 compared to 19.91 for DistilGPT2 (after fine-tuning on the train set).
 
-Here's the updated results on the dev sets of GLUE:
+For more information on DistilBERT, please refer to our [NeurIPS workshop paper](TODO LINK). The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performance.
 
-| Model      | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP | RTE | SST-2 | STS-B | WNLI |
+Here are the results on the dev sets of GLUE:
+
+| Model      | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP  | RTE  | SST-2| STS-B| WNLI |
 | :---:      |    :---:    | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:|
 | BERT-base  |  **77.6**   | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
-| DistilBERT |  **75.2**   | 49.1 | 81.8 | 90.2 | 87.0 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
+| DistilBERT |  **76.8**   | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
 
 ## Setup
 
@@ -26,10 +29,12 @@ This part of the library has only be tested with Python3.6+. There are few speci
 
 ## How to use DistilBERT
 
-Transformers includes two pre-trained DistilBERT models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
+Transformers includes two pre-trained Distil* models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
 
 - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
 - `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
+- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
+- and more to come! 🤗🤗🤗
 
 Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
 
@@ -42,9 +47,11 @@ outputs = model(input_ids)
 last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 ```
 
-## How to train DistilBERT
+Similarly, using DistilGPT2 simply consists in calling the GPT2 classes from a different pretrained checkpoint: `model = GPT2Model.from_pretrained('distilgpt2')`.
 
-In the following, we will explain how you can train your own compressed model.
+## How to train Distil*
+
+In the following, we will explain how you can train DistilBERT.
 
 ### A. Preparing the data
 
@@ -57,7 +64,8 @@ First, we will binarize the data, i.e. tokenize the data and convert each token
 ```bash
 python scripts/binarized_data.py \
     --file_path data/dump.txt \
-    --bert_tokenizer bert-base-uncased \
+    --tokenizer_type bert \
+    --tokenizer_name bert-base-uncased \
     --dump_file data/binarized_text
 ```
 
@@ -66,7 +74,8 @@ Our implementation of masked language modeling loss follows [XLM](https://github
 ```bash
 python scripts/token_counts.py \
     --data_file data/binarized_text.bert-base-uncased.pickle \
-    --token_counts_dump data/token_counts.bert-base-uncased.pickle
+    --token_counts_dump data/token_counts.bert-base-uncased.pickle \
+    --vocab_size 30522
 ```
 
 ### B. Training
@@ -75,6 +84,12 @@ Training with distillation is really simple once you have pre-processed the data
 
 ```bash
 python train.py \
+    --student_type distilbert \
+    --student_config training_configs/distilbert-base-uncased.json \
+    --teacher_type bert \
+    --teacher_name bert-base-uncased \
+    --alpha_ce 0.33 --alpha_mlm 0.33 --alpha_cos 0.33 --mlm \
+    --freeze_pos_embs \
     --dump_path serialization_dir/my_first_training \
     --data_file data/binarized_text.bert-base-uncased.pickle \
     --token_counts data/token_counts.bert-base-uncased.pickle \
@@ -83,7 +98,7 @@ python train.py \
 
 By default, this will launch a training on a single GPU (even if more are available on the cluster). Other parameters are available in the command line, please look in `train.py` or run `python train.py --help` to list them.
 
-We highly encourage you to use distributed training for training DistilBert as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
+We highly encourage you to use distributed training for training DistilBERT as the training corpus is quite large. Here's an example that runs a distributed training on a single node having 4 GPUs:
 
 ```bash
 export NODE_RANK=0
@@ -105,11 +120,17 @@ python -m torch.distributed.launch \
     train.py \
         --force \
         --n_gpu $WORLD_SIZE \
+        --student_type distilbert \
+        --student_config training_configs/distilbert-base-uncased.json \
+        --teacher_type bert \
+        --teacher_name bert-base-uncased \
+        --alpha_ce 0.33 --alpha_mlm 0.33 --alpha_cos 0.33 --mlm \
+        --freeze_pos_embs \
+        --dump_path serialization_dir/my_first_training \
         --data_file data/binarized_text.bert-base-uncased.pickle \
-        --token_counts data/token_counts.bert-base-uncased.pickle \
-        --dump_path serialization_dir/my_first_distillation
+        --token_counts data/token_counts.bert-base-uncased.pickle
 ```
 
-**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract_for_distil.py` to create a valid initialization checkpoint and use `--from_pretrained_weights` and `--from_pretrained_config` arguments to use this initialization for the distilled training!
+**Tips:** Starting distillated training with good initialization of the model weights is crucial to reach decent performance. In our experiments, we initialized our model from a few layers of the teacher (Bert) itself! Please refer to `scripts/extract.py` and `scripts/extract_distilbert.py` to create a valid initialization checkpoint and use `--student_pretrained_weights` argument to use this initialization for the distilled training!
 
 Happy distillation!

From f1f23ad1710953e75b53a85953b018b8caceb427 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 19:03:32 -0400
Subject: [PATCH 26/55] fix bug in convert_pt_chkpt_to_tf2

---
 transformers/convert_pytorch_checkpoint_to_tf2.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py
index c5f7650b50..d8a48e9dcd 100644
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -228,6 +228,7 @@ if __name__ == "__main__":
     convert_all_pt_checkpoints_to_tf(args.model_type.lower() if args.model_type is not None else None,
                                         args.tf_dump_path,
                                         model_shortcut_names_or_path=[args.pytorch_checkpoint_path] if args.pytorch_checkpoint_path is not None else None,
+                                        config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
                                         compare_with_pt_model=args.compare_with_pt_model,
                                         use_cached_models=args.use_cached_models,
                                         only_convert_finetuned_models=args.only_convert_finetuned_models)

From 35071007cb1600acf7e8197e42f16e3698dc5f35 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 2 Oct 2019 23:01:36 -0400
Subject: [PATCH 27/55] =?UTF-8?q?incoming=20release=20=F0=9F=94=A5=20updat?=
 =?UTF-8?q?e=20links=20to=20arxiv=20preprint?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                       | 3 +--
 examples/distillation/README.md | 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index b5b7245bd9..4b4f6d5def 100644
--- a/README.md
+++ b/README.md
@@ -120,8 +120,7 @@ At some point in the future, you'll be able to seamlessly move from pre-training
 5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
 7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the blogpost [Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT](https://medium.com/huggingface/distilbert-8cf3380435b5
-) by Victor Sanh, Lysandre Debut and Thomas Wolf.
+8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation).
 
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
 
diff --git a/examples/distillation/README.md b/examples/distillation/README.md
index ad439cf5f8..8436ab95bf 100644
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -2,7 +2,7 @@
 
 This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT and DistilGPT2.
 
-**2019, October 3rd - Update** We release our [NeurIPS workshop paper](TODO LINK) explaining our approach on DistilBERT. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of DistilGPT2. DistilGPT2 is two times faster and 33% smaller than GPT2.
+**2019, October 3rd - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2.
 
 **2019, September 19th - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
 
@@ -12,7 +12,7 @@ Distil* is a class of compressed models that started with DistilBERT. DistilBERT
 
 We have applied the same method to GPT2 and release the weights of the compressed model. On the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test of 15.84 compared to 19.91 for DistilGPT2 (after fine-tuning on the train set).
 
-For more information on DistilBERT, please refer to our [NeurIPS workshop paper](TODO LINK). The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances.
+For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108). The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances.
 
 Here are the results on the dev sets of GLUE:
 
@@ -88,7 +88,7 @@ python train.py \
     --student_config training_configs/distilbert-base-uncased.json \
     --teacher_type bert \
     --teacher_name bert-base-uncased \
-    --alpha_ce 0.33 --alpha_mlm 0.33 --alpha_cos 0.33 --mlm \
+    --alpha_ce 5.0 --alpha_mlm 2.0 --alpha_cos 1.0 --mlm \
     --freeze_pos_embs \
     --dump_path serialization_dir/my_first_training \
     --data_file data/binarized_text.bert-base-uncased.pickle \

From 5f07d8f11a66ea58be31b93a0dc21f428b7e1714 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Thu, 3 Oct 2019 09:59:32 -0400
Subject: [PATCH 28/55] prepare release

---
 examples/distillation/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/distillation/README.md b/examples/distillation/README.md
index 8436ab95bf..5b6fbd2e9a 100644
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -10,7 +10,7 @@ This folder contains the original code used to train Distil* as well as examples
 
 Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
 
-We have applied the same method to GPT2 and release the weights of the compressed model. On the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test of 15.84 compared to 19.91 for DistilGPT2 (after fine-tuning on the train set).
+We have applied the same method to GPT2 and release the weights of the compressed model. On the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test of 15.8 compared to 19.3 for DistilGPT2 (after fine-tuning on the train set).
 
 For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108). The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances.
 

From 6be46a6e6422d7ab34984d6fbcaadad0323e5349 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Thu, 3 Oct 2019 10:07:18 -0400
Subject: [PATCH 29/55] update links to new weights

---
 transformers/configuration_gpt2.py | 3 ++-
 transformers/modeling_gpt2.py      | 3 ++-
 transformers/modeling_tf_gpt2.py   | 3 ++-
 transformers/tokenization_gpt2.py  | 3 +++
 4 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/transformers/configuration_gpt2.py b/transformers/configuration_gpt2.py
index c83d9e82ce..e7d853f317 100644
--- a/transformers/configuration_gpt2.py
+++ b/transformers/configuration_gpt2.py
@@ -28,7 +28,8 @@ logger = logging.getLogger(__name__)
 
 GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-config.json",
                                       "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-config.json",
-                                      "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json"}
+                                      "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-config.json",
+                                      "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-config.json",}
 
 class GPT2Config(PretrainedConfig):
     """Configuration class to store the configuration of a `GPT2Model`.
diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py
index bc85224022..891dfc5677 100644
--- a/transformers/modeling_gpt2.py
+++ b/transformers/modeling_gpt2.py
@@ -38,7 +38,8 @@ logger = logging.getLogger(__name__)
 
 GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-pytorch_model.bin",
                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-pytorch_model.bin",
-                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin"}
+                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-pytorch_model.bin",
+                                     "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-pytorch_model.bin",}
 
 def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
     """ Load tf checkpoints in a pytorch model
diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py
index e958c2cbf1..883340cac9 100644
--- a/transformers/modeling_tf_gpt2.py
+++ b/transformers/modeling_tf_gpt2.py
@@ -38,7 +38,8 @@ logger = logging.getLogger(__name__)
 
 TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP = {"gpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-tf_model.h5",
                                      "gpt2-medium": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-tf_model.h5",
-                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5"}
+                                     "gpt2-large": "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-tf_model.h5",
+                                     "distilgpt2": "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-tf_model.h5",}
 
 
 def load_gpt2_pt_weights_in_tf2(tf_model, pytorch_checkpoint_path):
diff --git a/transformers/tokenization_gpt2.py b/transformers/tokenization_gpt2.py
index 3e931dfcf8..6a7f75acb2 100644
--- a/transformers/tokenization_gpt2.py
+++ b/transformers/tokenization_gpt2.py
@@ -46,12 +46,14 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-vocab.json",
         'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-vocab.json",
+        'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-vocab.json",
     },
     'merges_file':
     {
         'gpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt",
         'gpt2-medium': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-medium-merges.txt",
         'gpt2-large': "https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-large-merges.txt",
+        'distilgpt2': "https://s3.amazonaws.com/models.huggingface.co/bert/distilgpt2-merges.txt",
     },
 }
 
@@ -59,6 +61,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'gpt2': 1024,
     'gpt2-medium': 1024,
     'gpt2-large': 1024,
+    'distilgpt2': 1024,
 }
 
 @lru_cache()

From 4a790c40b1817fd457043f9933266b4d5e20b3b7 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Thu, 3 Oct 2019 10:54:02 -0400
Subject: [PATCH 30/55] update doc for distil*

---
 docs/source/pretrained_models.rst | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index c12a9bc52f..7606082c7e 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -119,11 +119,14 @@ Here is the full list of the currently provided pretrained models together with
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | DistilBERT        | ``distilbert-base-uncased``                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint                                                   |
-|                   |                                                            | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__)                                                            |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``distilbert-base-uncased-distilled-squad``                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
 |                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                 |
-|                   |                                                            | (see `details <https://medium.com/huggingface/distilbert-8cf3380435b5>`__)                                                            |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``distilbert-base-uncased-distilled-squad``                | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
+|                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-
 .. <https://huggingface.co/transformers/examples.html>`__
\ No newline at end of file

From c1689ac30164d190f366d95d1f5153af53e66355 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Thu, 3 Oct 2019 10:56:39 -0400
Subject: [PATCH 31/55] fix name

---
 docs/source/pretrained_models.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 7606082c7e..2622f3cd80 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -125,7 +125,7 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                 |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``distilbert-base-uncased-distilled-squad``                | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
+|                   | ``distilgpt2``                                             | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
 |                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+

From 7af0777910f1450965819111c6cd5a637630d086 Mon Sep 17 00:00:00 2001
From: Brian Ma <brian41005@gmail.com>
Date: Thu, 3 Oct 2019 16:29:43 +0800
Subject: [PATCH 32/55] Update run_glue.py

add DistilBert model shortcut into ALL_MODELS
---
 examples/run_glue.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index fc3b617da0..e02e9b4294 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -53,7 +53,8 @@ from transformers import glue_convert_examples_to_features as convert_examples_t
 
 logger = logging.getLogger(__name__)
 
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig)), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, 
+                                                                                RobertaConfig, DistilBertConfig)), ())
 
 MODEL_CLASSES = {
     'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),

From e2ae9c0b73e87a0d8053046ba1b33c3632750028 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Thu, 3 Oct 2019 11:42:21 -0400
Subject: [PATCH 33/55] fix links in doc index

---
 docs/source/index.rst | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 8c76b89185..3b4fe4d1e8 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -46,8 +46,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
 5. `XLNet <https://github.com/zihangdai/xlnet>`_ (from Google/CMU) released with the paper `​XLNet: Generalized Autoregressive Pretraining for Language Understanding <https://arxiv.org/abs/1906.08237>`_ by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. `XLM <https://github.com/facebookresearch/XLM>`_ (from Facebook) released together with the paper `Cross-lingual Language Model Pretraining <https://arxiv.org/abs/1901.07291>`_ by Guillaume Lample and Alexis Conneau.
 7. `RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`_ (from Facebook), released together with the paper a `Robustly Optimized BERT Pretraining Approach <https://arxiv.org/abs/1907.11692>`_ by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
-8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the blog post `Smaller, faster, cheaper, lighter: Introducing DistilBERT, a distilled version of BERT <https://medium.com/huggingface/distilbert-8cf3380435b5>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf.
-
+8. `DistilBERT <https://huggingface.co/transformers/model_doc/distilbert.html>`_ (from HuggingFace) released together with the paper `DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter <https://arxiv.org/abs/1910.01108>`_ by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into `DistilGPT2 <https://github.com/huggingface/transformers/tree/master/examples/distillation>`_.
 
 .. toctree::
     :maxdepth: 2

From e1b2949ae6cb34cc39e3934ca87423474f8c8d02 Mon Sep 17 00:00:00 2001
From: drc10723 <drcjudelhi@gmail.com>
Date: Thu, 3 Oct 2019 21:22:36 +0530
Subject: [PATCH 34/55] DistilBert Documentation Code Example fixes

---
 transformers/modeling_distilbert.py    | 2 +-
 transformers/modeling_tf_distilbert.py | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py
index 2425ab5f47..ebb89f0f95 100644
--- a/transformers/modeling_distilbert.py
+++ b/transformers/modeling_distilbert.py
@@ -649,7 +649,7 @@ class DistilBertForQuestionAnswering(DistilBertPreTrainedModel):
         start_positions = torch.tensor([1])
         end_positions = torch.tensor([3])
         outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
-        loss, start_scores, end_scores = outputs[:2]
+        loss, start_scores, end_scores = outputs[:3]
 
     """
     def __init__(self, config):
diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py
index 5ce1616bcc..6ed2844567 100644
--- a/transformers/modeling_tf_distilbert.py
+++ b/transformers/modeling_tf_distilbert.py
@@ -603,7 +603,7 @@ class TFDistilBertForMaskedLM(TFDistilBertPreTrainedModel):
         tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
         model = TFDistilBertForMaskedLM.from_pretrained('distilbert-base-uncased')
         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids, masked_lm_labels=input_ids)
+        outputs = model(input_ids)
         prediction_scores = outputs[0]
 
     """
@@ -715,9 +715,7 @@ class TFDistilBertForQuestionAnswering(TFDistilBertPreTrainedModel):
         tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
         model = TFDistilBertForQuestionAnswering.from_pretrained('distilbert-base-uncased')
         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        start_positions = tf.constant([1])
-        end_positions = tf.constant([3])
-        outputs = model(input_ids, start_positions=start_positions, end_positions=end_positions)
+        outputs = model(input_ids)
         start_scores, end_scores = outputs[:2]
 
     """

From c2c2ca0fdba6f16f2e66d0a21152aa4ae493ca78 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 3 Oct 2019 17:18:48 -0400
Subject: [PATCH 35/55] Added XLM to run_generation, with prompt language
 selection.

---
 examples/run_generation.py | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/examples/run_generation.py b/examples/run_generation.py
index 9e98a9e870..a70a0e7842 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -26,12 +26,13 @@ import torch
 import torch.nn.functional as F
 import numpy as np
 
-from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig
+from transformers import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig
 
 from transformers import GPT2LMHeadModel, GPT2Tokenizer
 from transformers import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
 from transformers import XLNetLMHeadModel, XLNetTokenizer
 from transformers import TransfoXLLMHeadModel, TransfoXLTokenizer
+from transformers import XLMWithLMHeadModel, XLMTokenizer
 
 
 logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
@@ -41,13 +42,14 @@ logger = logging.getLogger(__name__)
 
 MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop
 
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, XLMConfig)), ())
 
 MODEL_CLASSES = {
     'gpt2': (GPT2LMHeadModel, GPT2Tokenizer),
     'openai-gpt': (OpenAIGPTLMHeadModel, OpenAIGPTTokenizer),
     'xlnet': (XLNetLMHeadModel, XLNetTokenizer),
     'transfo-xl': (TransfoXLLMHeadModel, TransfoXLTokenizer),
+    'xlm': (XLMWithLMHeadModel, XLMTokenizer),
 }
 
 # Padding text to help Transformer-XL and XLNet with short prompts as proposed by Aman Rusia
@@ -103,7 +105,8 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')
     return logits
 
 
-def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, is_xlnet=False, device='cpu'):
+def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=0, top_p=0.0, is_xlnet=False,
+                    xlm_lang=None, device='cpu'):
     context = torch.tensor(context, dtype=torch.long, device=device)
     context = context.unsqueeze(0).repeat(num_samples, 1)
     generated = context
@@ -121,6 +124,9 @@ def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=
                 target_mapping[0, 0, -1] = 1.0  # predict last token
                 inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}
 
+            if xlm_lang is not None:
+                inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1]).view(1, -1)
+
             outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
             next_token_logits = outputs[0][0, -1, :] / temperature
             filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
@@ -137,6 +143,7 @@ def main():
                         help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
     parser.add_argument("--prompt", type=str, default="")
     parser.add_argument("--padding_text", type=str, default="")
+    parser.add_argument("--xlm_lang", type=str, default="", help="Optional language when used with the XLM model.")
     parser.add_argument("--length", type=int, default=20)
     parser.add_argument("--temperature", type=float, default=1.0)
     parser.add_argument("--top_k", type=int, default=0)
@@ -168,6 +175,17 @@ def main():
 
     print(args)
     while True:
+        xlm_lang = None
+        # XLM Language usage detailed in the issues #1414
+        if args.model_type in ["xlm"] and hasattr(tokenizer, 'lang2id'):
+            if args.xlm_lang:
+                language = args.xlm_lang
+            else:
+                language = None
+                while language not in tokenizer.lang2id.keys():
+                    language = input("Using XLM. Select language in " + str(list(tokenizer.lang2id.keys())) + " >>> ")
+            xlm_lang = tokenizer.lang2id[language]
+
         raw_text = args.prompt if args.prompt else input("Model prompt >>> ")
         if args.model_type in ["transfo-xl", "xlnet"]:
             # Models with memory likes to have a long prompt for short inputs.
@@ -180,11 +198,12 @@ def main():
             temperature=args.temperature,
             top_k=args.top_k,
             top_p=args.top_p,
-            device=args.device,
             is_xlnet=bool(args.model_type == "xlnet"),
+            xlm_lang=xlm_lang,
+            device=args.device,
         )
         out = out[0, len(context_tokens):].tolist()
-        text = tokenizer.decode(out, clean_up_tokenization_spaces=True)
+        text = tokenizer.decode(out, clean_up_tokenization_spaces=True, skip_special_tokens=True)
         print(text)
         if args.prompt:
             break

From ecc4f1bdfae5b3a9679fae499c5c9b375d927547 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 3 Oct 2019 17:42:16 -0400
Subject: [PATCH 36/55] XLM use_lang_embedding flag in run_generation

---
 examples/run_generation.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/run_generation.py b/examples/run_generation.py
index a70a0e7842..83926f42b7 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -177,7 +177,8 @@ def main():
     while True:
         xlm_lang = None
         # XLM Language usage detailed in the issues #1414
-        if args.model_type in ["xlm"] and hasattr(tokenizer, 'lang2id'):
+        if args.model_type in ["xlm"] and hasattr(tokenizer, 'lang2id') and hasattr(model.config, 'use_lang_emb') \
+                and model.config.use_lang_emb:
             if args.xlm_lang:
                 language = args.xlm_lang
             else:

From 7bddb45a6f2a6d2edf2bedde3817041a9d169d2b Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 4 Oct 2019 14:27:38 -0400
Subject: [PATCH 37/55] Decode documentation

---
 transformers/tokenization_utils.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index de3f48f4c3..a712703190 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -912,6 +912,11 @@ class PreTrainedTokenizer(object):
         Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
         with options to remove special tokens and clean up tokenization spaces.
         Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
+
+        Args:
+            token_ids: list of tokenized input ids. Can be obtained using the `encode` or `encode_plus` methods.
+            skip_special_tokens: if set to True, will remove special tokens from the decoded string.
+            clean_up_tokenization_spaces: if set to True, will clean up the tokenization spaces.
         """
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
 

From bb464289ce41257d554e6a38c4f68e97757dd7da Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Fri, 4 Oct 2019 16:41:26 -0400
Subject: [PATCH 38/55] New model addition issue template

---
 .../ISSUE_TEMPLATE/--new-model-addition.md    | 23 +++++++++++++++++++
 .github/ISSUE_TEMPLATE/bug-report.md          |  6 ++++-
 .github/ISSUE_TEMPLATE/feature-request.md     |  6 ++++-
 .github/ISSUE_TEMPLATE/migration.md           |  6 ++++-
 .github/ISSUE_TEMPLATE/question-help.md       |  6 ++++-
 5 files changed, 43 insertions(+), 4 deletions(-)
 create mode 100644 .github/ISSUE_TEMPLATE/--new-model-addition.md

diff --git a/.github/ISSUE_TEMPLATE/--new-model-addition.md b/.github/ISSUE_TEMPLATE/--new-model-addition.md
new file mode 100644
index 0000000000..96fd85269d
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/--new-model-addition.md
@@ -0,0 +1,23 @@
+---
+name: "\U0001F31FNew model addition"
+about: Submit a proposal/request to implement a new Transformer-based model
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+# 🌟New model addition
+
+## Model description
+
+<!-- Important information -->
+
+## Open Source status
+
+* [ ] the model implementation is available: (give details)
+* [ ] the model weights are available: (give details)
+
+## Additional context
+
+<!-- Add any other context about the problem here. -->
diff --git a/.github/ISSUE_TEMPLATE/bug-report.md b/.github/ISSUE_TEMPLATE/bug-report.md
index 66f7831aea..337c980ac9 100644
--- a/.github/ISSUE_TEMPLATE/bug-report.md
+++ b/.github/ISSUE_TEMPLATE/bug-report.md
@@ -1,6 +1,10 @@
 ---
 name: "\U0001F41B Bug Report"
 about: Submit a bug report to help us improve PyTorch Transformers
+title: ''
+labels: ''
+assignees: ''
+
 ---
 
 ## 🐛 Bug
@@ -45,4 +49,4 @@ Steps to reproduce the behavior:
 
 ## Additional context
 
-<!-- Add any other context about the problem here. -->
\ No newline at end of file
+<!-- Add any other context about the problem here. -->
diff --git a/.github/ISSUE_TEMPLATE/feature-request.md b/.github/ISSUE_TEMPLATE/feature-request.md
index 828e3737be..df874ae929 100644
--- a/.github/ISSUE_TEMPLATE/feature-request.md
+++ b/.github/ISSUE_TEMPLATE/feature-request.md
@@ -1,6 +1,10 @@
 ---
 name: "\U0001F680 Feature Request"
 about: Submit a proposal/request for a new PyTorch Transformers feature
+title: ''
+labels: ''
+assignees: ''
+
 ---
 
 ## 🚀 Feature
@@ -13,4 +17,4 @@ about: Submit a proposal/request for a new PyTorch Transformers feature
 
 ## Additional context
 
-<!-- Add any other context or screenshots about the feature request here. -->
\ No newline at end of file
+<!-- Add any other context or screenshots about the feature request here. -->
diff --git a/.github/ISSUE_TEMPLATE/migration.md b/.github/ISSUE_TEMPLATE/migration.md
index 8ce1bc8fdd..d1dbe3dcfd 100644
--- a/.github/ISSUE_TEMPLATE/migration.md
+++ b/.github/ISSUE_TEMPLATE/migration.md
@@ -1,6 +1,10 @@
 ---
 name: "\U0001F4DA Migration from PyTorch-pretrained-Bert"
 about: Report a problem when migrating from PyTorch-pretrained-Bert to Transformers
+title: ''
+labels: ''
+assignees: ''
+
 ---
 
 ## 📚 Migration
@@ -40,4 +44,4 @@ Details of the issue:
 
 ## Additional context
 
-<!-- Add any other context about the problem here. -->
\ No newline at end of file
+<!-- Add any other context about the problem here. -->
diff --git a/.github/ISSUE_TEMPLATE/question-help.md b/.github/ISSUE_TEMPLATE/question-help.md
index 8c76994b02..77187a495f 100644
--- a/.github/ISSUE_TEMPLATE/question-help.md
+++ b/.github/ISSUE_TEMPLATE/question-help.md
@@ -1,8 +1,12 @@
 ---
 name: "❓Questions & Help"
 about: Start a general discussion related to PyTorch Transformers
+title: ''
+labels: ''
+assignees: ''
+
 ---
 
 ## ❓ Questions & Help
 
-<!-- A clear and concise description of the question. -->
\ No newline at end of file
+<!-- A clear and concise description of the question. -->

From 764a7923ec1ca27a3c35e6c3eb7c1574f75e741c Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Fri, 27 Sep 2019 17:44:32 -0400
Subject: [PATCH 39/55] add distillation+finetuning option in run_squad

---
 examples/run_squad.py | 64 ++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 57 insertions(+), 7 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 0c0fbf2963..922a323087 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning the library models for question-answering on SQuAD (Bert, XLM, XLNet)."""
+""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet) with an optional step of distillation."""
 
 from __future__ import absolute_import, division, print_function
 
@@ -28,6 +28,8 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
+import torch.nn.functional as F
+import torch.nn as nn
 from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
@@ -73,7 +75,7 @@ def set_seed(args):
 def to_list(tensor):
     return tensor.detach().cpu().tolist()
 
-def train(args, train_dataset, model, tokenizer):
+def train(args, train_dataset, model, tokenizer, teacher=None):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
@@ -132,17 +134,40 @@ def train(args, train_dataset, model, tokenizer):
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
             model.train()
+            if teacher is not None:
+                teacher.eval()
             batch = tuple(t.to(args.device) for t in batch)
             inputs = {'input_ids':       batch[0],
                       'attention_mask':  batch[1], 
-                      'token_type_ids':  None if args.model_type == 'xlm' else batch[2],  
                       'start_positions': batch[3], 
                       'end_positions':   batch[4]}
+            if args.model_type != 'distilbert':
+                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
             if args.model_type in ['xlnet', 'xlm']:
                 inputs.update({'cls_index': batch[5],
                                'p_mask':       batch[6]})
             outputs = model(**inputs)
-            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
+            loss, start_logits_stu, end_logits_stu = outputs
+
+            # Distillation loss
+            if teacher is not None:
+                if 'token_type_ids' not in inputs:
+                    inputs['token_type_ids'] = None if args.teacher_type == 'xlm' else batch[2]
+                with torch.no_grad():
+                    start_logits_tea, end_logits_tea = teacher(input_ids=inputs['input_ids'],
+                                                               token_type_ids=inputs['token_type_ids'],
+                                                               attention_mask=inputs['attention_mask'])
+                assert start_logits_tea.size() == start_logits_stu.size()
+                assert end_logits_tea.size() == end_logits_stu.size()
+                
+                loss_fct = nn.KLDivLoss(reduction='batchmean')
+                loss_start = loss_fct(F.log_softmax(start_logits_stu/args.temperature, dim=-1),
+                                      F.softmax(start_logits_tea/args.temperature, dim=-1)) * (args.temperature**2)
+                loss_end = loss_fct(F.log_softmax(end_logits_stu/args.temperature, dim=-1),
+                                    F.softmax(end_logits_tea/args.temperature, dim=-1)) * (args.temperature**2)
+                loss_ce = (loss_start + loss_end)/2.
+
+                loss = args.alpha_ce*loss_ce + args.alpha_squad*loss
 
             if args.n_gpu > 1:
                 loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
@@ -218,9 +243,10 @@ def evaluate(args, model, tokenizer, prefix=""):
         batch = tuple(t.to(args.device) for t in batch)
         with torch.no_grad():
             inputs = {'input_ids':      batch[0],
-                      'attention_mask': batch[1],
-                      'token_type_ids': None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+                      'attention_mask': batch[1]
                       }
+            if args.model_type != 'distilbert':
+                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
             example_indices = batch[3]
             if args.model_type in ['xlnet', 'xlm']:
                 inputs.update({'cls_index': batch[4],
@@ -341,6 +367,18 @@ def main():
     parser.add_argument("--output_dir", default=None, type=str, required=True,
                         help="The output directory where the model checkpoints and predictions will be written.")
 
+    # Distillation parameters (optional)
+    parser.add_argument('--teacher_type', default=None, type=str,
+                        help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.")
+    parser.add_argument('--teacher_name_or_path', default=None, type=str,
+                        help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.")
+    parser.add_argument('--alpha_ce', default=0.5, type=float,
+                        help="Distillation loss linear weight. Only for distillation.")
+    parser.add_argument('--alpha_squad', default=0.5, type=float,
+                        help="True SQuAD loss linear weight. Only for distillation.")
+    parser.add_argument('--temperature', default=2.0, type=float,
+                        help="Distillation temperature. Only for distillation.")
+
     ## Other parameters
     parser.add_argument("--config_name", default="", type=str,
                         help="Pretrained config name or path if not the same as model_name")
@@ -468,6 +506,18 @@ def main():
     tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
     model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
 
+    if args.teacher_type is not None:
+        assert args.teacher_name_or_path is not None
+        assert args.alpha_ce > 0.
+        assert args.alpha_ce + args.alpha_squad > 0.
+        assert args.teacher_type != 'distilbert', "We constraint teachers not to be of type DistilBERT."
+        teacher_config_class, teacher_model_class, _ = MODEL_CLASSES[args.teacher_type]
+        teacher_config = teacher_config_class.from_pretrained(args.teacher_name_or_path)
+        teacher = teacher_model_class.from_pretrained(args.teacher_name_or_path, config=teacher_config)
+        teacher.to(args.device)
+    else:
+        teacher = None
+
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 
@@ -478,7 +528,7 @@ def main():
     # Training
     if args.do_train:
         train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher=teacher)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
 

From f5891c3821ab5e2bc547d854596e270ca451e862 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Fri, 4 Oct 2019 17:15:04 -0400
Subject: [PATCH 40/55] run_squad --> run_squad_w_distillation

---
 .../distillation/run_squad_w_distillation.py  | 585 ++++++++++++++++++
 examples/run_squad.py                         |  55 +-
 2 files changed, 589 insertions(+), 51 deletions(-)
 create mode 100644 examples/distillation/run_squad_w_distillation.py

diff --git a/examples/distillation/run_squad_w_distillation.py b/examples/distillation/run_squad_w_distillation.py
new file mode 100644
index 0000000000..4be641dd81
--- /dev/null
+++ b/examples/distillation/run_squad_w_distillation.py
@@ -0,0 +1,585 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" This is the exact same script as `examples/run_squad.py` (as of 2019, October 4th) with an additional and optional step of distillation."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import os
+import random
+import glob
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+import torch.nn.functional as F
+import torch.nn as nn
+from tqdm import tqdm, trange
+
+from tensorboardX import SummaryWriter
+
+from transformers import (WEIGHTS_NAME, BertConfig,
+                                  BertForQuestionAnswering, BertTokenizer,
+                                  XLMConfig, XLMForQuestionAnswering,
+                                  XLMTokenizer, XLNetConfig,
+                                  XLNetForQuestionAnswering,
+                                  XLNetTokenizer,
+                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+
+from transformers import AdamW, WarmupLinearSchedule
+
+from ..utils_squad import (read_squad_examples, convert_examples_to_features,
+                         RawResult, write_predictions,
+                         RawResultExtended, write_predictions_extended)
+
+# The following import is the official SQuAD evaluation script (2.0).
+# You can remove it from the dependencies if you are using this script outside of the library
+# We've added it here for automated tests (see examples/test_examples.py file)
+from ..utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
+                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
+
+MODEL_CLASSES = {
+    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
+    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
+    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+}
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+def to_list(tensor):
+    return tensor.detach().cpu().tolist()
+
+def train(args, train_dataset, model, tokenizer, teacher=None):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            if teacher is not None:
+                teacher.eval()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {'input_ids':       batch[0],
+                      'attention_mask':  batch[1], 
+                      'start_positions': batch[3], 
+                      'end_positions':   batch[4]}
+            if args.model_type != 'distilbert':
+                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
+            if args.model_type in ['xlnet', 'xlm']:
+                inputs.update({'cls_index': batch[5],
+                               'p_mask':       batch[6]})
+            outputs = model(**inputs)
+            loss, start_logits_stu, end_logits_stu = outputs
+
+            # Distillation loss
+            if teacher is not None:
+                if 'token_type_ids' not in inputs:
+                    inputs['token_type_ids'] = None if args.teacher_type == 'xlm' else batch[2]
+                with torch.no_grad():
+                    start_logits_tea, end_logits_tea = teacher(input_ids=inputs['input_ids'],
+                                                               token_type_ids=inputs['token_type_ids'],
+                                                               attention_mask=inputs['attention_mask'])
+                assert start_logits_tea.size() == start_logits_stu.size()
+                assert end_logits_tea.size() == end_logits_stu.size()
+                
+                loss_fct = nn.KLDivLoss(reduction='batchmean')
+                loss_start = loss_fct(F.log_softmax(start_logits_stu/args.temperature, dim=-1),
+                                      F.softmax(start_logits_tea/args.temperature, dim=-1)) * (args.temperature**2)
+                loss_end = loss_fct(F.log_softmax(end_logits_stu/args.temperature, dim=-1),
+                                    F.softmax(end_logits_tea/args.temperature, dim=-1)) * (args.temperature**2)
+                loss_ce = (loss_start + loss_end)/2.
+
+                loss = args.alpha_ce*loss_ce + args.alpha_squad*loss
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
+
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(args.output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Eval!
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = %d", len(dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    all_results = []
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        model.eval()
+        batch = tuple(t.to(args.device) for t in batch)
+        with torch.no_grad():
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1]
+                      }
+            if args.model_type != 'distilbert':
+                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+            example_indices = batch[3]
+            if args.model_type in ['xlnet', 'xlm']:
+                inputs.update({'cls_index': batch[4],
+                               'p_mask':    batch[5]})
+            outputs = model(**inputs)
+
+        for i, example_index in enumerate(example_indices):
+            eval_feature = features[example_index.item()]
+            unique_id = int(eval_feature.unique_id)
+            if args.model_type in ['xlnet', 'xlm']:
+                # XLNet uses a more complex post-processing procedure
+                result = RawResultExtended(unique_id            = unique_id,
+                                           start_top_log_probs  = to_list(outputs[0][i]),
+                                           start_top_index      = to_list(outputs[1][i]),
+                                           end_top_log_probs    = to_list(outputs[2][i]),
+                                           end_top_index        = to_list(outputs[3][i]),
+                                           cls_logits           = to_list(outputs[4][i]))
+            else:
+                result = RawResult(unique_id    = unique_id,
+                                   start_logits = to_list(outputs[0][i]),
+                                   end_logits   = to_list(outputs[1][i]))
+            all_results.append(result)
+
+    # Compute predictions
+    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
+    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
+    if args.version_2_with_negative:
+        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
+    else:
+        output_null_log_odds_file = None
+
+    if args.model_type in ['xlnet', 'xlm']:
+        # XLNet uses a more complex post-processing procedure
+        write_predictions_extended(examples, features, all_results, args.n_best_size,
+                        args.max_answer_length, output_prediction_file,
+                        output_nbest_file, output_null_log_odds_file, args.predict_file,
+                        model.config.start_n_top, model.config.end_n_top,
+                        args.version_2_with_negative, tokenizer, args.verbose_logging)
+    else:
+        write_predictions(examples, features, all_results, args.n_best_size,
+                        args.max_answer_length, args.do_lower_case, output_prediction_file,
+                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
+                        args.version_2_with_negative, args.null_score_diff_threshold)
+
+    # Evaluate with the official SQuAD script
+    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
+                                 pred_file=output_prediction_file,
+                                 na_prob_file=output_null_log_odds_file)
+    results = evaluate_on_squad(evaluate_options)
+    return results
+
+
+def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+    if args.local_rank not in [-1, 0] and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Load data features from cache or dataset file
+    input_file = args.predict_file if evaluate else args.train_file
+    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        str(args.max_seq_length)))
+    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", input_file)
+        examples = read_squad_examples(input_file=input_file,
+                                                is_training=not evaluate,
+                                                version_2_with_negative=args.version_2_with_negative)
+        features = convert_examples_to_features(examples=examples,
+                                                tokenizer=tokenizer,
+                                                max_seq_length=args.max_seq_length,
+                                                doc_stride=args.doc_stride,
+                                                max_query_length=args.max_query_length,
+                                                is_training=not evaluate)
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    if args.local_rank == 0 and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
+    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
+    if evaluate:
+        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_example_index, all_cls_index, all_p_mask)
+    else:
+        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
+        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_start_positions, all_end_positions,
+                                all_cls_index, all_p_mask)
+
+    if output_examples:
+        return dataset, examples, features
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--train_file", default=None, type=str, required=True,
+                        help="SQuAD json for training. E.g., train-v1.1.json")
+    parser.add_argument("--predict_file", default=None, type=str, required=True,
+                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model checkpoints and predictions will be written.")
+
+    # Distillation parameters (optional)
+    parser.add_argument('--teacher_type', default=None, type=str,
+                        help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.")
+    parser.add_argument('--teacher_name_or_path', default=None, type=str,
+                        help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.")
+    parser.add_argument('--alpha_ce', default=0.5, type=float,
+                        help="Distillation loss linear weight. Only for distillation.")
+    parser.add_argument('--alpha_squad', default=0.5, type=float,
+                        help="True SQuAD loss linear weight. Only for distillation.")
+    parser.add_argument('--temperature', default=2.0, type=float,
+                        help="Distillation temperature. Only for distillation.")
+
+    ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+
+    parser.add_argument('--version_2_with_negative', action='store_true',
+                        help='If true, the SQuAD examples contain some that do not have an answer.')
+    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
+                        help="If null_score - best_non_null is greater than the threshold predict null.")
+
+    parser.add_argument("--max_seq_length", default=384, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+                             "longer than this will be truncated, and sequences shorter than this will be padded.")
+    parser.add_argument("--doc_stride", default=128, type=int,
+                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
+    parser.add_argument("--max_query_length", default=64, type=int,
+                        help="The maximum number of tokens for the question. Questions longer than this will "
+                             "be truncated to this length.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Rul evaluation during training at each logging step.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--n_best_size", default=20, type=int,
+                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
+    parser.add_argument("--max_answer_length", default=30, type=int,
+                        help="The maximum length of an answer that can be generated. This is needed because the start "
+                             "and end predictions are not conditioned on one another.")
+    parser.add_argument("--verbose_logging", action='store_true',
+                        help="If true, all of the warnings related to data processing will be printed. "
+                             "A number of warnings are expected for a normal SQuAD evaluation.")
+
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+
+    # Set seed
+    set_seed(args)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+
+    if args.teacher_type is not None:
+        assert args.teacher_name_or_path is not None
+        assert args.alpha_ce > 0.
+        assert args.alpha_ce + args.alpha_squad > 0.
+        assert args.teacher_type != 'distilbert', "We constraint teachers not to be of type DistilBERT."
+        teacher_config_class, teacher_model_class, _ = MODEL_CLASSES[args.teacher_type]
+        teacher_config = teacher_config_class.from_pretrained(args.teacher_name_or_path)
+        teacher = teacher_model_class.from_pretrained(args.teacher_name_or_path, config=teacher_config)
+        teacher.to(args.device)
+    else:
+        teacher = None
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher=teacher)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+
+    # Save the trained model and the tokenizer
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        model.to(args.device)
+
+
+    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+
+        for checkpoint in checkpoints:
+            # Reload the model
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+
+            # Evaluate
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+
+            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            results.update(result)
+
+    logger.info("Results: {}".format(results))
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/run_squad.py b/examples/run_squad.py
index 922a323087..bd9005cc29 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet) with an optional step of distillation."""
+""" Finetuning the library models for question-answering on SQuAD (DistilBERT, Bert, XLM, XLNet)."""
 
 from __future__ import absolute_import, division, print_function
 
@@ -28,8 +28,6 @@ import torch
 from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                               TensorDataset)
 from torch.utils.data.distributed import DistributedSampler
-import torch.nn.functional as F
-import torch.nn as nn
 from tqdm import tqdm, trange
 
 from tensorboardX import SummaryWriter
@@ -75,7 +73,7 @@ def set_seed(args):
 def to_list(tensor):
     return tensor.detach().cpu().tolist()
 
-def train(args, train_dataset, model, tokenizer, teacher=None):
+def train(args, train_dataset, model, tokenizer):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
@@ -134,8 +132,6 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
         for step, batch in enumerate(epoch_iterator):
             model.train()
-            if teacher is not None:
-                teacher.eval()
             batch = tuple(t.to(args.device) for t in batch)
             inputs = {'input_ids':       batch[0],
                       'attention_mask':  batch[1], 
@@ -147,27 +143,7 @@ def train(args, train_dataset, model, tokenizer, teacher=None):
                 inputs.update({'cls_index': batch[5],
                                'p_mask':       batch[6]})
             outputs = model(**inputs)
-            loss, start_logits_stu, end_logits_stu = outputs
-
-            # Distillation loss
-            if teacher is not None:
-                if 'token_type_ids' not in inputs:
-                    inputs['token_type_ids'] = None if args.teacher_type == 'xlm' else batch[2]
-                with torch.no_grad():
-                    start_logits_tea, end_logits_tea = teacher(input_ids=inputs['input_ids'],
-                                                               token_type_ids=inputs['token_type_ids'],
-                                                               attention_mask=inputs['attention_mask'])
-                assert start_logits_tea.size() == start_logits_stu.size()
-                assert end_logits_tea.size() == end_logits_stu.size()
-                
-                loss_fct = nn.KLDivLoss(reduction='batchmean')
-                loss_start = loss_fct(F.log_softmax(start_logits_stu/args.temperature, dim=-1),
-                                      F.softmax(start_logits_tea/args.temperature, dim=-1)) * (args.temperature**2)
-                loss_end = loss_fct(F.log_softmax(end_logits_stu/args.temperature, dim=-1),
-                                    F.softmax(end_logits_tea/args.temperature, dim=-1)) * (args.temperature**2)
-                loss_ce = (loss_start + loss_end)/2.
-
-                loss = args.alpha_ce*loss_ce + args.alpha_squad*loss
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
 
             if args.n_gpu > 1:
                 loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
@@ -367,18 +343,6 @@ def main():
     parser.add_argument("--output_dir", default=None, type=str, required=True,
                         help="The output directory where the model checkpoints and predictions will be written.")
 
-    # Distillation parameters (optional)
-    parser.add_argument('--teacher_type', default=None, type=str,
-                        help="Teacher type. Teacher tokenizer and student (model) tokenizer must output the same tokenization. Only for distillation.")
-    parser.add_argument('--teacher_name_or_path', default=None, type=str,
-                        help="Path to the already SQuAD fine-tuned teacher model. Only for distillation.")
-    parser.add_argument('--alpha_ce', default=0.5, type=float,
-                        help="Distillation loss linear weight. Only for distillation.")
-    parser.add_argument('--alpha_squad', default=0.5, type=float,
-                        help="True SQuAD loss linear weight. Only for distillation.")
-    parser.add_argument('--temperature', default=2.0, type=float,
-                        help="Distillation temperature. Only for distillation.")
-
     ## Other parameters
     parser.add_argument("--config_name", default="", type=str,
                         help="Pretrained config name or path if not the same as model_name")
@@ -506,17 +470,6 @@ def main():
     tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
     model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
 
-    if args.teacher_type is not None:
-        assert args.teacher_name_or_path is not None
-        assert args.alpha_ce > 0.
-        assert args.alpha_ce + args.alpha_squad > 0.
-        assert args.teacher_type != 'distilbert', "We constraint teachers not to be of type DistilBERT."
-        teacher_config_class, teacher_model_class, _ = MODEL_CLASSES[args.teacher_type]
-        teacher_config = teacher_config_class.from_pretrained(args.teacher_name_or_path)
-        teacher = teacher_model_class.from_pretrained(args.teacher_name_or_path, config=teacher_config)
-        teacher.to(args.device)
-    else:
-        teacher = None
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
@@ -528,7 +481,7 @@ def main():
     # Training
     if args.do_train:
         train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher=teacher)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
 

From 0820bb055585b97495660fb8e3aca42dd6764f98 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Fri, 4 Oct 2019 17:16:10 -0400
Subject: [PATCH 41/55] unnecessary carriage return

---
 examples/run_squad.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index bd9005cc29..eb351b340c 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -470,7 +470,6 @@ def main():
     tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
     model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
 
-
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
 

From f3e0218fbb6bcc40b40f10089dae8876654edb23 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Sat, 5 Oct 2019 21:05:16 -0400
Subject: [PATCH 42/55] Correct device assignment in run_generation

---
 examples/run_generation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_generation.py b/examples/run_generation.py
index 33a0ae1816..de2f6b8869 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -125,7 +125,7 @@ def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=
                 inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}
 
             if xlm_lang is not None:
-                inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1]).view(1, -1)
+                inputs["langs"] = torch.tensor([xlm_lang] * inputs["input_ids"].shape[1], device=device).view(1, -1)
 
             outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
             next_token_logits = outputs[0][0, -1, :] / temperature

From 0f65d8cbbe8be704670e337ad4383568babf5789 Mon Sep 17 00:00:00 2001
From: Christopher Goh <chrisgzf@gmail.com>
Date: Mon, 7 Oct 2019 01:14:34 +0800
Subject: [PATCH 43/55] Fix some typos in README

---
 README.md | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 4b4f6d5def..e311e68a85 100644
--- a/README.md
+++ b/README.md
@@ -67,7 +67,7 @@ This repo is tested on Python 2.7 and 3.5+ (examples are tested only on python 3
 ### With pip
 
 First you need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refere to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
+Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
 
 When TensorFlow 2.0 and/or PyTorch has been installed, 🤗 Transformers can be installed using pip as follows:
 
@@ -78,7 +78,7 @@ pip install transformers
 ### From source
 
 Here also, you first need to install one of, or both, TensorFlow 2.0 and PyTorch.
-Please refere to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
+Please refer to [TensorFlow installation page](https://www.tensorflow.org/install/pip#tensorflow-2.0-rc-is-available) and/or [PyTorch installation page](https://pytorch.org/get-started/locally/#start-locally) regarding the specific install command for your platform.
 
 When TensorFlow 2.0 and/or PyTorch has been installed, you can install from source by cloning the repository and running:
 
@@ -423,7 +423,7 @@ Here is a quick summary of what you should take care of when migrating from `pyt
 
 ### Models always output `tuples`
 
-The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the models forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the model's forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
 
 The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
 
@@ -459,9 +459,9 @@ By enabling the configuration option `output_hidden_states`, it was possible to
 
 Breaking change in the `from_pretrained()` method:
 
-1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
+1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them, don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
 
-2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
+2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead, which can break derived model classes built based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/transformers/pull/866) by forwarding to the model's `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
 
 Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
 

From 904158ac4dbce046dd02be8382fdb8e52f0e691c Mon Sep 17 00:00:00 2001
From: Christopher Goh <chrisgzf@gmail.com>
Date: Mon, 7 Oct 2019 11:03:49 +0800
Subject: [PATCH 44/55] Rephrase forward method to reduce ambiguity

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e311e68a85..d49849da22 100644
--- a/README.md
+++ b/README.md
@@ -423,7 +423,7 @@ Here is a quick summary of what you should take care of when migrating from `pyt
 
 ### Models always output `tuples`
 
-The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that the model's forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
+The main breaking change when migrating from `pytorch-pretrained-bert` to `transformers` is that every model's forward method always outputs a `tuple` with various elements depending on the model and the configuration parameters.
 
 The exact content of the tuples for each model is detailed in the models' docstrings and the [documentation](https://huggingface.co/transformers/).
 

From 6dc6c716c5d63e3de842efca016304f2f0621a10 Mon Sep 17 00:00:00 2001
From: seanBE <SeanBE@users.noreply.github.com>
Date: Mon, 7 Oct 2019 09:59:54 +0100
Subject: [PATCH 45/55] fix pytorch-transformers migration description in
 README

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index d49849da22..b2b9bc9abe 100644
--- a/README.md
+++ b/README.md
@@ -56,7 +56,7 @@ Choose the right framework for every part of a model's lifetime
 | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
 | [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
-| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
+| [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
 | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
 | [Documentation](https://huggingface.co/transformers/) | Full API documentation and more |
 

From 7afd00a661c432f1bc5c87de9a8d08774084dfca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Mon, 7 Oct 2019 17:58:13 +0200
Subject: [PATCH 46/55] freeze dev requirements

---
 requirements-dev.txt | 48 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 48 insertions(+)
 create mode 100644 requirements-dev.txt

diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000000..30ae8bf740
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,48 @@
+absl-py==0.8.0
+astor==0.8.0
+atomicwrites==1.3.0
+attrs==19.2.0
+boto3==1.9.243
+botocore==1.12.243
+certifi==2019.9.11
+chardet==3.0.4
+Click==7.0
+docutils==0.15.2
+gast==0.2.2
+google-pasta==0.1.7
+grpcio==1.24.1
+h5py==2.10.0
+idna==2.8
+importlib-metadata==0.23
+jmespath==0.9.4
+joblib==0.14.0
+Keras-Applications==1.0.8
+Keras-Preprocessing==1.1.0
+Markdown==3.1.1
+more-itertools==7.2.0
+numpy==1.17.2
+opt-einsum==3.1.0
+packaging==19.2
+pluggy==0.13.0
+protobuf==3.10.0
+py==1.8.0
+pyparsing==2.4.2
+pytest==5.2.1
+python-dateutil==2.8.0
+regex==2019.8.19
+requests==2.22.0
+s3transfer==0.2.1
+sacremoses==0.0.35
+sentencepiece==0.1.83
+six==1.12.0
+tensorboard==2.0.0
+tensorflow==2.0.0
+tensorflow-estimator==2.0.0
+termcolor==1.1.0
+torch==1.2.0
+tqdm==4.36.1
+urllib3==1.25.6
+wcwidth==0.1.7
+Werkzeug==0.16.0
+wrapt==1.11.2
+zipp==0.6.0

From 9f81f1cba8a5f6ffc3c449909489343555745df5 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Mon, 7 Oct 2019 12:30:19 -0400
Subject: [PATCH 47/55] fix convert pt_to_tf2 for custom weights

---
 transformers/convert_pytorch_checkpoint_to_tf2.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py
index d8a48e9dcd..b7e0e79183 100644
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -173,10 +173,12 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
             else:
                 model_file = cached_path(model_shortcut_name, force_download=not use_cached_models)
 
-            convert_pt_checkpoint_to_tf(model_type,
-                                        model_file,
-                                        config_file,
-                                        os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
+            if os.path.isfile(model_shortcut_name):
+                model_shortcut_name = 'converted_model'
+            convert_pt_checkpoint_to_tf(model_type=model_type,
+                                        pytorch_checkpoint_path=model_file,
+                                        config_file=config_file,
+                                        tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
                                         compare_with_pt_model=compare_with_pt_model)
             os.remove(config_file)
             os.remove(model_file)

From 7ce83b4931fe675955e82e1aac07e6c8fe972c9c Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Mon, 7 Oct 2019 12:30:27 -0400
Subject: [PATCH 48/55] update weights for distilgpt2

---
 examples/distillation/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/distillation/README.md b/examples/distillation/README.md
index 5b6fbd2e9a..86a729e830 100644
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -10,7 +10,7 @@ This folder contains the original code used to train Distil* as well as examples
 
 Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
 
-We have applied the same method to GPT2 and release the weights of the compressed model. On the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test of 15.8 compared to 19.3 for DistilGPT2 (after fine-tuning on the train set).
+We have applied the same method to GPT2 and release the weights of the compressed model. On the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for DistilGPT2 (after fine-tuning on the train set).
 
 For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108). The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances.
 

From 8fcc6507ce9d0922ddb60f4a31d4b9a839de1270 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 7 Oct 2019 15:02:42 -0400
Subject: [PATCH 49/55] Multilingual

---
 docs/source/index.rst        |   1 +
 docs/source/multilingual.rst | 103 +++++++++++++++++++++++++++++++++++
 2 files changed, 104 insertions(+)
 create mode 100644 docs/source/multilingual.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 3b4fe4d1e8..80d68884f8 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -62,6 +62,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
     migration
     bertology
     torchscript
+    multilingual
 
 .. toctree::
     :maxdepth: 2
diff --git a/docs/source/multilingual.rst b/docs/source/multilingual.rst
new file mode 100644
index 0000000000..f6f72b2434
--- /dev/null
+++ b/docs/source/multilingual.rst
@@ -0,0 +1,103 @@
+Multi-lingual models
+================================================
+
+Most of the models available in this library are mono-lingual models (English, Chinese and German). A few
+multi-lingual models are available and rely on different mechanisms than mono-lingual models.
+This page details the usage of these models.
+
+The two models that currently support multiple languages are BERT and XLM.
+
+XLM
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+XLM has a total of 10 different checkpoints, only one of which is mono-lingual. The 9 remaining model checkpoints can
+be split in two categories: the checkpoints that make use of language embeddings, and those that don't.
+
+XLM & Language Embeddings
+------------------------------------------------
+
+This section concerns the following checkpoints:
+
+- ``xlm-mlm-ende-1024`` (Masked language modeling, English-German)
+- ``xlm-mlm-enfr-1024`` (Masked language modeling, English-French)
+- ``xlm-mlm-enro-1024`` (Masked language modeling, English-Romanian)
+- ``xlm-mlm-xnli15-1024`` (Masked language modeling, XNLI languages)
+- ``xlm-mlm-tlm-xnli15-1024`` (Masked language modeling + Translation, XNLI languages)
+- ``xlm-clm-enfr-1024`` (Causal language modeling, English-French)
+- ``xlm-clm-ende-1024`` (Causal language modeling, English-German)
+
+These checkpoints require language embeddings that will specify the language used at inference time. These language
+embeddings are represented as a tensor that is of the same shape as the input ids passed to the model. The values in
+these tensors depend on the language used and are identifiable using the ``lang2id`` and ``id2lang`` attributes
+from the tokenizer.
+
+Here is an example using the ``xlm-clm-enfr-1024`` checkpoint (Causal language modeling, English-French):
+
+
+.. code-block::
+
+    import torch
+    from transformers import XLMTokenizer, XLMWithLMHeadModel
+
+    tokenizer = XLMTokenizer.from_pretrained("xlm-clm-enfr-1024")
+    model = XLMWithLMHeadModel.from_pretrained("xlm-clm-enfr-1024")
+
+The different languages this model/tokenizer handles, as well as the ids of these languages are visible using the
+``lang2id`` attribute:
+
+.. code-block::
+
+    print(tokenizer.lang2id)  # {'en': 0, 'fr': 1}
+
+
+These ids should be used when passing a language parameter during a model pass. Let's define our inputs:
+
+.. code-block::
+
+    input_ids = torch.tensor([tokenizer.encode("Wikipedia was used to")]) # batch size of 1
+
+
+We should now define the language embedding by using the previously defined language id. We want to create a tensor
+filled with the appropriate language ids, of the same size as input_ids. For English, the id is 0:
+
+.. code-block::
+
+    language_id = tokenizer.lang2id['en']  # 0
+    langs = torch.tensor([language_id] * input_ids.shape[1])  # torch.tensor([0, 0, 0, ..., 0])
+
+    # We reshape it to be of size (batch_size, sequence_length)
+    langs = langs.view(1, -1) # is now of shape [1, sequence_length] (we have a batch size of 1)
+
+
+You can then feed it all as input to your model:
+
+.. code-block::
+
+    outputs = model(input_ids, langs=langs)
+
+
+The example `run_generation.py <https://github.com/huggingface/transformers/blob/master/examples/run_generation.py>`__
+can generate text using the CLM checkpoints from XLM, using the language embeddings.
+
+XLM without Language Embeddings
+------------------------------------------------
+
+This section concerns the following checkpoints:
+
+- ``xlm-mlm-17-1280`` (Masked language modeling, 17 languages)
+- ``xlm-mlm-100-1280`` (Masked language modeling, 100 languages)
+
+These checkpoints do not require language embeddings at inference time. These models are used to obtain generic
+sentence representations, unlike the previously mentioned XLM checkpoints.
+
+
+BERT
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+BERT has two checkpoints that can be used for multi-lingual tasks:
+
+- ``bert-base-multilingual-uncased`` (Masked language modeling + Next sentence prediction, 102 languages)
+- ``bert-base-multilingual-cased`` (Masked language modeling + Next sentence prediction, 104 languages)
+
+These checkpoints do not require language embeddings at inference time. They should identify the language
+used in the context and infer accordingly.
\ No newline at end of file

From e9c09052a4bf8531bdb94c1789f4138986ebe630 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Mon, 7 Oct 2019 22:16:12 +0200
Subject: [PATCH 50/55] add issues and requests guidelines

---
 CONTRIBUTING.md | 86 +++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 CONTRIBUTING.md

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
new file mode 100644
index 0000000000..70da6dc8fb
--- /dev/null
+++ b/CONTRIBUTING.md
@@ -0,0 +1,86 @@
+# How to contribute to transformers?
+
+Everyone is welcome to contribute, and we value everybody's contribution. Code
+is thus not the only way to contribute. Answering questions, helping others,
+reaching out and improving the documentations are immensely valuable to the
+community.
+
+It also helps us if you spread the word: reference the library from blog posts
+on the awesome projects it made possible, shout out on twitter every time it has
+helped you, or simply star the repo to say "thank you".
+
+## You can contribute in so many ways!
+
+There are 4 ways you can contribute to transformers:
+* Fixing outstanding issues with the existing code;
+* Implementing new models;
+* Contributing to the examples, or to the documentation;
+* Submitting issues related to bugs or desired new features.
+
+*All are equally valuable to the community.*
+
+## Submitting a new issue or feature request
+
+Do your best to follow these guidelines when submitting an issue or a feature
+request. It will make it easier for us to come back to you quickly and with good
+feedback.
+
+### Did you find a bug?
+
+The transformers are robust and reliable thanks to the users who notify us of
+the problems they encounter.
+
+So thank you for reporting an issue. First, we would really appreciate it if you
+could **make sure the bug was not already reported** (use the search bar on
+Github under Issues).
+
+Did not find it? :( So we can act quickly on it, please follow these steps:
+
+* Include your **OS type and version**, the versions of **Python**, **PyTorch** and
+  **Tensorflow** when applicable;
+* A short, self-contained, code snippet that allows us to reproduce the bug in
+  less than 30s.
+* Provide the *full* traceback if an exception is raised.
+
+To get the OS and software versions, execute the following code and copy-paste
+the output:
+
+```
+import platform; print("Platform", platform.platform())
+import sys; print("Python", sys.version)
+import torch; print("PyTorch", torch.__version__)
+import tensorflow; print("Tensorflow", tensorflow.__version__)
+```
+
+### Do you want to implement a new model?
+
+Please provide the following:
+
+* Short description of the model and link to the paper
+* Link to the implementation if open-source
+* Link to the model weights if they are available
+
+Let us know if you are willing to contribute so we can best guide you. 
+
+### Do you want a new feature (that is not a model)?
+
+A world-class feature request addresses the following points:
+
+1. Motivation first:
+  * Is it related to a problem/frustration with the library? If so, please explain
+    why. Providing a code snippet that demonstrates the problem is best.
+  * Is it related to something you would need for a project? We'd love to hear
+    about it!
+  * Is it something you worked on and think could benefit the community?
+    Awesome! Tell us what problem it solved for you.
+2. Write a *full paragraph* describing the feature.  
+3. Provide a **code snippet** that demonstrates its future use.
+4. In case this is related to a paper, please provide a link
+5. Attach any additional information (drawings, screenshots, etc.) you think may help.
+
+If your issue is well-written we're already 80% of the way there by the time you
+post it.
+
+## Contributing code
+
+## Contributing examples

From ade05b6cef0a5576532ed69588b95b54f694d0c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Mon, 7 Oct 2019 23:20:25 +0200
Subject: [PATCH 51/55] add code contribution

---
 CONTRIBUTING.md | 87 +++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 85 insertions(+), 2 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 70da6dc8fb..fa0081b24c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -81,6 +81,89 @@ A world-class feature request addresses the following points:
 If your issue is well-written we're already 80% of the way there by the time you
 post it.
 
-## Contributing code
+## Start contributing! (Pull Requests)
 
-## Contributing examples
+Before writing code, we strongly advise you to search through the exising PRs or
+issues to make sure that nobody is already working on the same thing. It is
+always a good idea to open an issue to get some feedback.
+
+You will need basic `git` proficiency to be able to contribute to
+`transformers`. `git` is not the easiest tool to use but it has the greatest
+manual. Type `git --help` in a shell and enjoy. If you prefer books, [Pro
+Git](https://git-scm.com/book/en/v2) is a very good reference.
+
+Follow these steps to start contributing:
+
+1. Fork the [repository](https://github.com/huggingface/transformers) by
+   clicking on the 'Fork' button. This creates a copy of the code
+   under your github user account.
+2. Clone your fork to your local disk, and add the base repository as a remote:
+   
+   ```bash
+   $ git clone git@github.com:<your Github handle>/transformers.git
+   $ cd transformers
+   $ git remote add upstream git@github.com:huggingface/transformers.git
+   ```
+
+3. Create a new branch to hold your development changes:
+
+   ```bash
+   $ git checkout -b a-descriptive-name-for-my-changes
+   ```
+   
+   **do not** work on the `master` branch.
+   
+4. Set up a development environment by running in a virtual environment:
+
+   ```bash
+   $ pip install -r requirements-dev.txt
+   ```
+
+5. Develop the features on your branch. Add changed files using `git add` and
+   then `git commit` to record your changes locally:
+   
+   ```bash
+   $ git add modified_file.py
+   $ git commit
+   ```
+   
+   Please write [good commit
+   messages](https://chris.beams.io/posts/git-commit/). It
+   is a good idea to sync your copy of the code with the original repository
+   regularly. This way you can quickly account for changes:
+   
+   ```bash
+   $ git fetch upstream
+   $ git rebase upstream/master
+   ```
+   
+   Push the changes to your account using:
+   
+   ```bash
+   $ git push -u origin a-descriptive-name-for-my-changes
+   ```
+   
+6. Once you are satisfied (**and the checklist below is happy too**), go to the
+   webpage of your fork on Github. Click on 'Pull request' to send your changes
+   to the project maintainers for review.
+
+
+### Checklist
+
+1. The title of your pull request should be a summary of its contribution;
+2. If your pull request adresses an issue, please mention the issue number in
+   the pull request description to make sure they are linked;
+3. To indicate a work in progress please prefix the title with `[WIP]`. These
+   are useful to avoid duplicated work, and to differentiate it from PRs ready
+   to be merged;
+4. Make sure pre-existing tests still pass;
+5. Add high-coverage tests. No quality test, no merge;
+6. All public methods must have informative docstrings.
+
+
+### Style guide
+
+For documentation strings, `transformers` follows the [google
+style](https://google.github.io/styleguide/pyguide.html).
+
+#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/master/CONTRIBUTING.md)

From 45de313a9edcc7a57b09a2a80be3aa865cf8c12c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Tue, 8 Oct 2019 11:54:10 +0200
Subject: [PATCH 52/55] add bullet point on modifying an existing PR

---
 CONTRIBUTING.md | 54 +++++++++++++++++++++++++++----------------------
 1 file changed, 30 insertions(+), 24 deletions(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index fa0081b24c..817ba56aaf 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -1,12 +1,12 @@
 # How to contribute to transformers?
 
 Everyone is welcome to contribute, and we value everybody's contribution. Code
-is thus not the only way to contribute. Answering questions, helping others,
-reaching out and improving the documentations are immensely valuable to the
-community.
+is thus not the only way to help the community. Answering questions, helping
+others, reaching out and improving the documentation are immensely valuable to
+the community.
 
 It also helps us if you spread the word: reference the library from blog posts
-on the awesome projects it made possible, shout out on twitter every time it has
+on the awesome projects it made possible, shout out on Twitter every time it has
 helped you, or simply star the repo to say "thank you".
 
 ## You can contribute in so many ways!
@@ -14,7 +14,7 @@ helped you, or simply star the repo to say "thank you".
 There are 4 ways you can contribute to transformers:
 * Fixing outstanding issues with the existing code;
 * Implementing new models;
-* Contributing to the examples, or to the documentation;
+* Contributing to the examples or to the documentation;
 * Submitting issues related to bugs or desired new features.
 
 *All are equally valuable to the community.*
@@ -28,18 +28,17 @@ feedback.
 ### Did you find a bug?
 
 The transformers are robust and reliable thanks to the users who notify us of
-the problems they encounter.
+the problems they encounter. So thank you for reporting an issue.
 
-So thank you for reporting an issue. First, we would really appreciate it if you
-could **make sure the bug was not already reported** (use the search bar on
-Github under Issues).
+First, we would really appreciate it if you could **make sure the bug was not
+already reported** (use the search bar on Github under Issues).
 
 Did not find it? :( So we can act quickly on it, please follow these steps:
 
 * Include your **OS type and version**, the versions of **Python**, **PyTorch** and
   **Tensorflow** when applicable;
 * A short, self-contained, code snippet that allows us to reproduce the bug in
-  less than 30s.
+  less than 30s;
 * Provide the *full* traceback if an exception is raised.
 
 To get the OS and software versions, execute the following code and copy-paste
@@ -54,13 +53,14 @@ import tensorflow; print("Tensorflow", tensorflow.__version__)
 
 ### Do you want to implement a new model?
 
-Please provide the following:
+Awesome! Please provide the following information:
 
-* Short description of the model and link to the paper
-* Link to the implementation if open-source
-* Link to the model weights if they are available
+* Short description of the model and link to the paper;
+* Link to the implementation if it is open-source;
+* Link to the model weights if they are available.
 
-Let us know if you are willing to contribute so we can best guide you. 
+If you are willing to contribute the model yourself, let us know so we can best
+guide you.
 
 ### Do you want a new feature (that is not a model)?
 
@@ -73,19 +73,19 @@ A world-class feature request addresses the following points:
     about it!
   * Is it something you worked on and think could benefit the community?
     Awesome! Tell us what problem it solved for you.
-2. Write a *full paragraph* describing the feature.  
-3. Provide a **code snippet** that demonstrates its future use.
-4. In case this is related to a paper, please provide a link
+2. Write a *full paragraph* describing the feature;
+3. Provide a **code snippet** that demonstrates its future use;
+4. In case this is related to a paper, please attach a link;
 5. Attach any additional information (drawings, screenshots, etc.) you think may help.
 
-If your issue is well-written we're already 80% of the way there by the time you
+If your issue is well written we're already 80% of the way there by the time you
 post it.
 
 ## Start contributing! (Pull Requests)
 
 Before writing code, we strongly advise you to search through the exising PRs or
-issues to make sure that nobody is already working on the same thing. It is
-always a good idea to open an issue to get some feedback.
+issues to make sure that nobody is already working on the same thing. If you are
+unsure, it is always a good idea to open an issue to get some feedback.
 
 You will need basic `git` proficiency to be able to contribute to
 `transformers`. `git` is not the easiest tool to use but it has the greatest
@@ -95,7 +95,7 @@ Git](https://git-scm.com/book/en/v2) is a very good reference.
 Follow these steps to start contributing:
 
 1. Fork the [repository](https://github.com/huggingface/transformers) by
-   clicking on the 'Fork' button. This creates a copy of the code
+   clicking on the 'Fork' button on the repository's page. This creates a copy of the code
    under your github user account.
 2. Clone your fork to your local disk, and add the base repository as a remote:
    
@@ -113,7 +113,7 @@ Follow these steps to start contributing:
    
    **do not** work on the `master` branch.
    
-4. Set up a development environment by running in a virtual environment:
+4. Set up a development environment by running the following command in a virtual environment:
 
    ```bash
    $ pip install -r requirements-dev.txt
@@ -146,13 +146,19 @@ Follow these steps to start contributing:
 6. Once you are satisfied (**and the checklist below is happy too**), go to the
    webpage of your fork on Github. Click on 'Pull request' to send your changes
    to the project maintainers for review.
+   
+7. It's ok if maintainers ask you for changes. It happens to core contributors
+   too! So everyone can see the changes in the Pull request, work in your local
+   branch and push the changes to your fork. They will automatically appear in
+   the pull request.
 
 
 ### Checklist
 
 1. The title of your pull request should be a summary of its contribution;
 2. If your pull request adresses an issue, please mention the issue number in
-   the pull request description to make sure they are linked;
+   the pull request description to make sure they are linked (and people
+   consulting the issue know you are working on it);
 3. To indicate a work in progress please prefix the title with `[WIP]`. These
    are useful to avoid duplicated work, and to differentiate it from PRs ready
    to be merged;

From d688af19e5ce92c1395820a89e3f3b635eacc2ba Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 8 Oct 2019 16:37:52 -0400
Subject: [PATCH 53/55] Update link to swift-coreml-transformers

cc @lysandrejik
---
 README.md                    |  2 +-
 docs/source/installation.md  | 58 +++++++++++++++++++++++++++++
 docs/source/installation.rst | 71 ------------------------------------
 3 files changed, 59 insertions(+), 72 deletions(-)
 create mode 100644 docs/source/installation.md
 delete mode 100644 docs/source/installation.rst

diff --git a/README.md b/README.md
index b2b9bc9abe..87d6e18a55 100644
--- a/README.md
+++ b/README.md
@@ -105,7 +105,7 @@ python -m pytest -sv ./examples/
 
 You should check out our [`swift-coreml-transformers`](https://github.com/huggingface/swift-coreml-transformers) repo.
 
-It contains an example of a conversion script from a Pytorch trained Transformer model (here, `GPT-2`) to a CoreML model that runs on iOS devices.
+It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
 
 At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models to productizing them in CoreML, or prototype a model or an app in CoreML then research its hyperparameters or architecture from TensorFlow 2.0 and/or PyTorch. Super exciting!
 
diff --git a/docs/source/installation.md b/docs/source/installation.md
new file mode 100644
index 0000000000..11beb1ab3a
--- /dev/null
+++ b/docs/source/installation.md
@@ -0,0 +1,58 @@
+# Installation
+
+Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on Python 3.5+) and PyTorch 1.1.0.
+
+## With pip
+
+Transformers can be installed using pip as follows:
+
+``` bash
+pip install transformers
+```
+
+## From source
+
+To install from source, clone the repository and install with:
+
+``` bash
+git clone https://github.com/huggingface/transformers.git
+cd transformers
+pip install [--editable] .
+```
+
+## Tests
+
+An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the [tests folder](https://github.com/huggingface/transformers/tree/master/transformers/tests) and examples tests in the [examples folder](https://github.com/huggingface/transformers/tree/master/examples).
+
+Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
+
+Run all the tests from the root of the cloned repository with the commands:
+
+``` bash
+python -m pytest -sv ./transformers/tests/
+python -m pytest -sv ./examples/
+```
+
+## OpenAI GPT original tokenization workflow
+
+If you want to reproduce the original tokenization process of the `OpenAI GPT` paper, you will need to install `ftfy` (use version 4.4.3 if you are using Python 2) and `SpaCy`:
+
+``` bash
+pip install spacy ftfy==4.4.3
+python -m spacy download en
+```
+
+If you don't install `ftfy` and `SpaCy`, the `OpenAI GPT` tokenizer will fall back to tokenizing with BERT's `BasicTokenizer` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
+
+## Note on model downloads (Continuous Integration or large-scale deployments)
+
+If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
+
+## Do you want to run a Transformer model on a mobile device?
+
+You should check out our [swift-coreml-transformers](https://github.com/huggingface/swift-coreml-transformers) repo.
+
+It contains a set of tools to convert PyTorch or TensorFlow 2.0 trained Transformer models (currently contains `GPT-2`, `DistilGPT-2`, `BERT`, and `DistilBERT`) to CoreML models that run on iOS devices.
+
+At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch or TensorFlow 2.0 to productizing them in CoreML,
+or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch or TensorFlow 2.0. Super exciting!
diff --git a/docs/source/installation.rst b/docs/source/installation.rst
deleted file mode 100644
index 51f7eb520d..0000000000
--- a/docs/source/installation.rst
+++ /dev/null
@@ -1,71 +0,0 @@
-Installation
-================================================
-
-Transformers is tested on Python 2.7 and 3.5+ (examples are tested only on python 3.5+) and PyTorch 1.1.0
-
-With pip
-^^^^^^^^
-
-PyTorch Transformers can be installed using pip as follows:
-
-.. code-block:: bash
-
-   pip install transformers
-
-From source
-^^^^^^^^^^^
-
-To install from source, clone the repository and install with:
-
-.. code-block:: bash
-
-    git clone https://github.com/huggingface/transformers.git
-    cd transformers
-    pip install [--editable] .
-
-
-Tests
-^^^^^
-
-An extensive test suite is included to test the library behavior and several examples. Library tests can be found in the `tests folder <https://github.com/huggingface/transformers/tree/master/transformers/tests>`_ and examples tests in the `examples folder <https://github.com/huggingface/transformers/tree/master/examples>`_.
-
-Tests can be run using `pytest` (install pytest if needed with `pip install pytest`).
-
-Run all the tests from the root of the cloned repository with the commands:
-
-.. code-block:: bash
-
-    python -m pytest -sv ./transformers/tests/
-    python -m pytest -sv ./examples/
-
-
-OpenAI GPT original tokenization workflow
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-If you want to reproduce the original tokenization process of the ``OpenAI GPT`` paper, you will need to install ``ftfy`` (use version 4.4.3 if you are using Python 2) and ``SpaCy`` :
-
-.. code-block:: bash
-
-   pip install spacy ftfy==4.4.3
-   python -m spacy download en
-
-If you don't install ``ftfy`` and ``SpaCy``\ , the ``OpenAI GPT`` tokenizer will default to tokenize using BERT's ``BasicTokenizer`` followed by Byte-Pair Encoding (which should be fine for most usage, don't worry).
-
-
-Note on model downloads (Continuous Integration or large-scale deployments)
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-If you expect to be downloading large volumes of models (more than 1,000) from our hosted bucket (for instance through your CI setup, or a large-scale production deployment), please cache the model files on your end. It will be way faster, and cheaper. Feel free to contact us privately if you need any help.
-
-
-Do you want to run a Transformer model on a mobile device?
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-You should check out our `swift-coreml-transformers <https://github.com/huggingface/swift-coreml-transformers>`_ repo.
-
-It contains an example of a conversion script from a Pytorch trained Transformer model (here, ``GPT-2``) to a CoreML model that runs on iOS devices.
-
-It also contains an implementation of BERT for Question answering.
-
-At some point in the future, you'll be able to seamlessly move from pre-training or fine-tuning models in PyTorch to productizing them in CoreML,
-or prototype a model or an app in CoreML then research its hyperparameters or architecture from PyTorch. Super exciting!
\ No newline at end of file

From 23b7138ab495a5f39b648624a8dac73ce8d24f33 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 9 Oct 2019 01:54:44 +0200
Subject: [PATCH 54/55] fix #1378 and #1453

---
 transformers/modeling_tf_distilbert.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py
index 6ed2844567..fa2dc674af 100644
--- a/transformers/modeling_tf_distilbert.py
+++ b/transformers/modeling_tf_distilbert.py
@@ -226,8 +226,9 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
 
         dim_per_head = self.dim // self.n_heads
 
-        assert 2 <= len(tf.shape(mask)) <= 3
-        causal = (len(tf.shape(mask)) == 3)
+        mask_shape = shape_list(mask)
+        assert 2 <= len(mask_shape) <= 3
+        causal = (mask_shape) == 3)
         mask_reshape = [bs, 1, 1, k_length]
 
         def shape(x):

From 1c5079952f5f10eeac4cb6801b4fd1f36b0eff73 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 9 Oct 2019 04:26:20 +0200
Subject: [PATCH 55/55] simpler distilbert mask - fix tf tests

---
 transformers/modeling_distilbert.py    | 2 --
 transformers/modeling_tf_distilbert.py | 3 ---
 2 files changed, 5 deletions(-)

diff --git a/transformers/modeling_distilbert.py b/transformers/modeling_distilbert.py
index ebb89f0f95..d3b4ccff5d 100644
--- a/transformers/modeling_distilbert.py
+++ b/transformers/modeling_distilbert.py
@@ -159,8 +159,6 @@ class MultiHeadSelfAttention(nn.Module):
 
         dim_per_head = self.dim // self.n_heads
 
-        assert 2 <= mask.dim() <= 3
-        causal = (mask.dim() == 3)
         mask_reshp = (bs, 1, 1, k_length)
 
         def shape(x):
diff --git a/transformers/modeling_tf_distilbert.py b/transformers/modeling_tf_distilbert.py
index fa2dc674af..f9fe4ca9e9 100644
--- a/transformers/modeling_tf_distilbert.py
+++ b/transformers/modeling_tf_distilbert.py
@@ -226,9 +226,6 @@ class TFMultiHeadSelfAttention(tf.keras.layers.Layer):
 
         dim_per_head = self.dim // self.n_heads
 
-        mask_shape = shape_list(mask)
-        assert 2 <= len(mask_shape) <= 3
-        causal = (mask_shape) == 3)
         mask_reshape = [bs, 1, 1, k_length]
 
         def shape(x):