From a9f24a16bc2965d2990b90127ed4b5a1f47344b9 Mon Sep 17 00:00:00 2001
From: mataney <mataneyal1@gmail.com>
Date: Wed, 25 Sep 2019 15:53:29 +0300
Subject: [PATCH 001/144] [FIX] fix run_generation.py to work with batch_size >
 1

---
 examples/run_generation.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/examples/run_generation.py b/examples/run_generation.py
index a2a8f29103..935e578441 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -81,7 +81,6 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')
                 Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
         From: https://gist.github.com/thomwolf/1a5a29f6962089e871b94cbd09daf317
     """
-    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
     top_k = min(top_k, logits.size(-1))  # Safety check
     if top_k > 0:
         # Remove all tokens with a probability less than the last token of the top-k
@@ -98,7 +97,8 @@ def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')
         sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
         sorted_indices_to_remove[..., 0] = 0
 
-        indices_to_remove = sorted_indices[sorted_indices_to_remove]
+        # scatter sorted tensors to original indexing
+        indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
         logits[indices_to_remove] = filter_value
     return logits
 
@@ -122,10 +122,10 @@ def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=
                 inputs = {'input_ids': input_ids, 'perm_mask': perm_mask, 'target_mapping': target_mapping}
 
             outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet (cached hidden-states)
-            next_token_logits = outputs[0][0, -1, :] / temperature
+            next_token_logits = outputs[0][:, -1, :] / temperature
             filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
             next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
-            generated = torch.cat((generated, next_token.unsqueeze(0)), dim=1)
+            generated = torch.cat((generated, next_token), dim=1)
     return generated
 
 
@@ -139,6 +139,7 @@ def main():
     parser.add_argument("--padding_text", type=str, default="")
     parser.add_argument("--length", type=int, default=20)
     parser.add_argument("--temperature", type=float, default=1.0)
+    parser.add_argument("--num_samples", type=int, default=1)
     parser.add_argument("--top_k", type=int, default=0)
     parser.add_argument("--top_p", type=float, default=0.9)
     parser.add_argument("--no_cuda", action='store_true',
@@ -176,6 +177,7 @@ def main():
         out = sample_sequence(
             model=model,
             context=context_tokens,
+            num_samples=args.num_samples,
             length=args.length,
             temperature=args.temperature,
             top_k=args.top_k,
@@ -183,9 +185,10 @@ def main():
             device=args.device,
             is_xlnet=bool(args.model_type == "xlnet"),
         )
-        out = out[0, len(context_tokens):].tolist()
-        text = tokenizer.decode(out, clean_up_tokenization_spaces=True)
-        print(text)
+        out = out[:, len(context_tokens):].tolist()
+        for o in out:
+            text = tokenizer.decode(o, clean_up_tokenization_spaces=True)
+            print(text)
         if args.prompt:
             break
     return text

From 1dea291a0243ad0f17abb9b7bd6ddecdf6fbe516 Mon Sep 17 00:00:00 2001
From: Santiago Castro <sacastro@umich.edu>
Date: Sun, 6 Oct 2019 13:35:01 -0400
Subject: [PATCH 002/144] Remove unnecessary use of FusedLayerNorm in XLNet

---
 transformers/modeling_xlnet.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py
index d6bb2ebd38..2743b3f86e 100644
--- a/transformers/modeling_xlnet.py
+++ b/transformers/modeling_xlnet.py
@@ -188,11 +188,8 @@ def swish(x):
 ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish}
 
 
-try:
-    from apex.normalization.fused_layer_norm import FusedLayerNorm as XLNetLayerNorm
-except (ImportError, AttributeError) as e:
-    logger.info("Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .")
-    from torch.nn import LayerNorm as XLNetLayerNorm
+XLNetLayerNorm = nn.LayerNorm
+
 
 class XLNetRelativeAttention(nn.Module):
     def __init__(self, config):

From 4446c02b8a277325d7b145554b301919121902c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Mon, 7 Oct 2019 12:04:05 +0200
Subject: [PATCH 003/144] add wireframe for seq2seq model

---
 transformers/modeling_seq2seq.py            | 37 +++++++++++++++++++++
 transformers/tests/modeling_seq2seq_test.py | 23 +++++++++++++
 2 files changed, 60 insertions(+)
 create mode 100644 transformers/modeling_seq2seq.py
 create mode 100644 transformers/tests/modeling_seq2seq_test.py

diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
new file mode 100644
index 0000000000..990f35ffed
--- /dev/null
+++ b/transformers/modeling_seq2seq.py
@@ -0,0 +1,37 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Conditional generation class. """
+
+
+class Seq2SeqModel(object):
+    def __init__(self):
+        # need to use from_pretrained to initialize
+        raise NotImplementedError
+
+    @classmethod
+    def from_pretrained(cls, encoder, decoder):
+        # Here we should call AutoModel to initialize the models depending
+        # on the pretrained models taken as an input.
+        # For a first iteration we only work with Bert.
+        raise NotImplementedError
+
+    def __call__(self):
+        # allows to call an instance of the class
+        # model = Seq2Seq(encode='bert', decoder='bert')
+        raise NotImplementedError
+
+    def process(self):
+        # alternative API to __call__ it is more explicit.
+        raise NotImplementedError
diff --git a/transformers/tests/modeling_seq2seq_test.py b/transformers/tests/modeling_seq2seq_test.py
new file mode 100644
index 0000000000..1866dc10af
--- /dev/null
+++ b/transformers/tests/modeling_seq2seq_test.py
@@ -0,0 +1,23 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+
+class Seq2SeqTest(unittest.TestCase):
+    raise NotImplementedError
+
+
+def __main__():
+    unittest.main()

From 386e86e22288b08bf8bcc3cff4027c46ba866d91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Mon, 7 Oct 2019 13:00:06 +0200
Subject: [PATCH 004/144] raise exception when class initialized with __init__

---
 transformers/modeling_seq2seq.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index 990f35ffed..b14622e50f 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -17,11 +17,13 @@
 
 class Seq2SeqModel(object):
     def __init__(self):
-        # need to use from_pretrained to initialize
-        raise NotImplementedError
+        raise EnvironmentError(
+            """Seq2Seq is designed to be instantiated using the
+        `Seq2Seq.from_pretrained(encoder_name_or_path, decoder_name_or_path)` method."""
+        )
 
     @classmethod
-    def from_pretrained(cls, encoder, decoder):
+    def from_pretrained(cls, encoder_name, decoder_name):
         # Here we should call AutoModel to initialize the models depending
         # on the pretrained models taken as an input.
         # For a first iteration we only work with Bert.

From 0053c0e052d0c4c0eb63faaaf61b8140122f8444 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Mon, 7 Oct 2019 16:29:15 +0200
Subject: [PATCH 005/144] do some (light) housekeeping

Several packages were imported but never used, and the indentation and
blank-line spacing did not follow PEP 8.
---
 transformers/modeling_bert.py | 59 ++++++++++++++++++++---------------
 1 file changed, 34 insertions(+), 25 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index fc448fa366..3187d1ca50 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -17,12 +17,10 @@
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import json
 import logging
 import math
 import os
 import sys
-from io import open
 
 import torch
 from torch import nn
@@ -50,6 +48,7 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
 }
 
+
 def load_tf_weights_in_bert(model, config, tf_checkpoint_path):
     """ Load tf checkpoints in a pytorch model.
     """
@@ -125,12 +124,14 @@ def gelu(x):
     """
     return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))
 
+
 def gelu_new(x):
     """ Implementation of the gelu activation function currently in Google Bert repo (identical to OpenAI GPT).
         Also see https://arxiv.org/abs/1606.08415
     """
     return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
 
+
 def swish(x):
     return x * torch.sigmoid(x)
 
@@ -140,6 +141,7 @@ ACT2FN = {"gelu": gelu, "relu": torch.nn.functional.relu, "swish": swish, "gelu_
 
 BertLayerNorm = torch.nn.LayerNorm
 
+
 class BertEmbeddings(nn.Module):
     """Construct the embeddings from word, position and token_type embeddings.
     """
@@ -482,7 +484,7 @@ BERT_START_DOCSTRING = r"""    The BERT model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
             Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
@@ -496,13 +498,13 @@ BERT_INPUTS_DOCSTRING = r"""
             (a) For sequence pairs:
 
                 ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-                
+
                 ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
 
             (b) For single sequences:
 
                 ``tokens:         [CLS] the dog is hairy . [SEP]``
-                
+
                 ``token_type_ids:   0   0   0   0  0     0   0``
 
             Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
@@ -601,7 +603,7 @@ class BertModel(BertPreTrainedModel):
         # positions we want to attend and -10000.0 for masked positions.
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
 
         # Prepare head mask if needed
@@ -615,7 +617,7 @@ class BertModel(BertPreTrainedModel):
                 head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
                 head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to fload if need + fp16 compatibility
         else:
             head_mask = [None] * self.config.num_hidden_layers
 
@@ -631,8 +633,9 @@ class BertModel(BertPreTrainedModel):
 
 
 @add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
-    a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+                       a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
+                      BERT_START_DOCSTRING,
+                      BERT_INPUTS_DOCSTRING)
 class BertForPreTraining(BertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -692,7 +695,7 @@ class BertForPreTraining(BertPreTrainedModel):
         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids,
-                            position_ids=position_ids, 
+                            position_ids=position_ids,
                             head_mask=head_mask)
 
         sequence_output, pooled_output = outputs[:2]
@@ -711,7 +714,8 @@ class BertForPreTraining(BertPreTrainedModel):
 
 
 @add_start_docstrings("""Bert Model with a `language modeling` head on top. """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+                      BERT_START_DOCSTRING,
+                      BERT_INPUTS_DOCSTRING)
 class BertForMaskedLM(BertPreTrainedModel):
     r"""
         **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -764,7 +768,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids,
-                            position_ids=position_ids, 
+                            position_ids=position_ids,
                             head_mask=head_mask)
 
         sequence_output = outputs[0]
@@ -780,7 +784,8 @@ class BertForMaskedLM(BertPreTrainedModel):
 
 
 @add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+                      BERT_START_DOCSTRING,
+                      BERT_INPUTS_DOCSTRING)
 class BertForNextSentencePrediction(BertPreTrainedModel):
     r"""
         **next_sentence_label**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -825,7 +830,7 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids,
-                            position_ids=position_ids, 
+                            position_ids=position_ids,
                             head_mask=head_mask)
 
         pooled_output = outputs[1]
@@ -842,8 +847,9 @@ class BertForNextSentencePrediction(BertPreTrainedModel):
 
 
 @add_start_docstrings("""Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+                      the pooled output) e.g. for GLUE tasks. """,
+                      BERT_START_DOCSTRING,
+                      BERT_INPUTS_DOCSTRING)
 class BertForSequenceClassification(BertPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -891,7 +897,7 @@ class BertForSequenceClassification(BertPreTrainedModel):
         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids,
-                            position_ids=position_ids, 
+                            position_ids=position_ids,
                             head_mask=head_mask)
 
         pooled_output = outputs[1]
@@ -915,8 +921,9 @@ class BertForSequenceClassification(BertPreTrainedModel):
 
 
 @add_start_docstrings("""Bert Model with a multiple choice classification head on top (a linear layer on top of
-    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+                      the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+                      BERT_START_DOCSTRING,
+                      BERT_INPUTS_DOCSTRING)
 class BertForMultipleChoice(BertPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -990,8 +997,9 @@ class BertForMultipleChoice(BertPreTrainedModel):
 
 
 @add_start_docstrings("""Bert Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+                      the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+                      BERT_START_DOCSTRING,
+                      BERT_INPUTS_DOCSTRING)
 class BertForTokenClassification(BertPreTrainedModel):
     r"""
         **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
@@ -1037,7 +1045,7 @@ class BertForTokenClassification(BertPreTrainedModel):
         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids,
-                            position_ids=position_ids, 
+                            position_ids=position_ids,
                             head_mask=head_mask)
 
         sequence_output = outputs[0]
@@ -1062,8 +1070,9 @@ class BertForTokenClassification(BertPreTrainedModel):
 
 
 @add_start_docstrings("""Bert Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    BERT_START_DOCSTRING, BERT_INPUTS_DOCSTRING)
+                      the hidden-states output to compute `span start logits` and `span end logits`). """,
+                      BERT_START_DOCSTRING,
+                      BERT_INPUTS_DOCSTRING)
 class BertForQuestionAnswering(BertPreTrainedModel):
     r"""
         **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
@@ -1116,7 +1125,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids,
-                            position_ids=position_ids, 
+                            position_ids=position_ids,
                             head_mask=head_mask)
 
         sequence_output = outputs[0]

From dda1adad6de0874e337ec04e52c4c291c0abfb59 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Mon, 7 Oct 2019 16:31:46 +0200
Subject: [PATCH 006/144] rename BertLayer to BertEncoderLayer

---
 transformers/modeling_bert.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 3187d1ca50..20b49c592f 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -315,9 +315,9 @@ class BertOutput(nn.Module):
         return hidden_states
 
 
-class BertLayer(nn.Module):
+class BertEncoderLayer(nn.Module):
     def __init__(self, config):
-        super(BertLayer, self).__init__()
+        super(BertEncoderLayer, self).__init__()
         self.attention = BertAttention(config)
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)
@@ -336,7 +336,7 @@ class BertEncoder(nn.Module):
         super(BertEncoder, self).__init__()
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
-        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
+        self.layer = nn.ModuleList([BertEncoderLayer(config) for _ in range(config.num_hidden_layers)])
 
     def forward(self, hidden_states, attention_mask=None, head_mask=None):
         all_hidden_states = ()

From 31adbb247c8c3ec248e30f89a3e4278622915ff3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Mon, 7 Oct 2019 16:43:21 +0200
Subject: [PATCH 007/144] add class wireframes for Bert decoder

---
 transformers/modeling_bert.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 20b49c592f..f2e2dba589 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -331,6 +331,14 @@ class BertEncoderLayer(nn.Module):
         return outputs
 
 
+class BertDecoderLayer(nn.Module):
+    def __init__(self, config):
+        raise NotImplementedError
+
+    def forward(self, hidden_state, encoder_output):
+        raise NotImplementedError
+
+
 class BertEncoder(nn.Module):
     def __init__(self, config):
         super(BertEncoder, self).__init__()
@@ -363,6 +371,14 @@ class BertEncoder(nn.Module):
         return outputs  # last-layer hidden state, (all hidden states), (all attentions)
 
 
+class BertDecoder(nn.Module):
+    def __init__(self, config):
+        raise NotImplementedError
+
+    def forward(self, encoder_output):
+        raise NotImplementedError
+
+
 class BertPooler(nn.Module):
     def __init__(self, config):
         super(BertPooler, self).__init__()

From a0dcefa382a541d0fecd634d6d0c3f97cd221faf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Mon, 7 Oct 2019 17:53:58 +0200
Subject: [PATCH 008/144] generalize BertSelfAttention to take separate query,
 key, value

There is currently no way to specify the query, key and value separately
in the Attention module. However, the decoder's "encoder-decoder
attention" layers take the decoder's last output as the query and the
encoder's states as key and value. We thus modify the existing code so
that query, key and value can be passed separately.

This obviously raises some naming issues: `BertSelfAttention` is not
a self-attention module anymore, the way the residual is forwarded is
now awkward, etc. We will need to do some refactoring once the decoder
is fully implemented.
---
 transformers/modeling_bert.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index f2e2dba589..8a2624f8f0 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -198,10 +198,10 @@ class BertSelfAttention(nn.Module):
         x = x.view(*new_x_shape)
         return x.permute(0, 2, 1, 3)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None):
-        mixed_query_layer = self.query(hidden_states)
-        mixed_key_layer = self.key(hidden_states)
-        mixed_value_layer = self.value(hidden_states)
+    def forward(self, query, key, value, attention_mask=None, head_mask=None):
+        mixed_query_layer = self.query(query)
+        mixed_key_layer = self.key(key)
+        mixed_value_layer = self.value(value)
 
         query_layer = self.transpose_for_scores(mixed_query_layer)
         key_layer = self.transpose_for_scores(mixed_key_layer)
@@ -279,9 +279,12 @@ class BertAttention(nn.Module):
         self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def forward(self, input_tensor, attention_mask=None, head_mask=None):
-        self_outputs = self.self(input_tensor, attention_mask, head_mask)
-        attention_output = self.output(self_outputs[0], input_tensor)
+    def forward(self, query_tensor, key_tensor, value_tensor, attention_mask=None, head_mask=None):
+        self_outputs = self.self(query_tensor, key_tensor, value_tensor, attention_mask, head_mask)
+        # in encoder-decoder attention we use the output of the previous decoder stage as the query
+        # in the Multi-Head Attention. We thus pass query_tensor as the residual in BertOutput.
+        # This shows the limits of the current code architecture, which may benefit from some refactoring.
+        attention_output = self.output(self_outputs[0], query_tensor)
         outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
         return outputs
 
@@ -323,7 +326,11 @@ class BertEncoderLayer(nn.Module):
         self.output = BertOutput(config)
 
     def forward(self, hidden_states, attention_mask=None, head_mask=None):
-        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
+        attention_outputs = self.attention(query_tensor=hidden_states,
+                                           key_tensor=hidden_states,
+                                           value_tensor=hidden_states,
+                                           attention_mask=attention_mask,
+                                           head_mask=head_mask)
         attention_output = attention_outputs[0]
         intermediate_output = self.intermediate(attention_output)
         layer_output = self.output(intermediate_output, attention_output)
@@ -333,6 +340,7 @@ class BertEncoderLayer(nn.Module):
 
 class BertDecoderLayer(nn.Module):
     def __init__(self, config):
+        super(BertDecoderLayer, self).__init__()
         raise NotImplementedError
 
     def forward(self, hidden_state, encoder_output):

From cd6a59d5c1cb9c7905675fc82ce50df5e2bdf3f0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Tue, 8 Oct 2019 10:11:02 +0200
Subject: [PATCH 009/144] add a decoder layer for Bert

---
 transformers/modeling_bert.py | 24 +++++++++++++++++++++---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 8a2624f8f0..4011da18b4 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -341,10 +341,28 @@ class BertEncoderLayer(nn.Module):
 class BertDecoderLayer(nn.Module):
     def __init__(self, config):
         super(BertDecoderLayer, self).__init__()
-        raise NotImplementedError
+        self.self_attention = BertAttention(config)
+        self.attention = BertAttention(config)
+        self.intermediate = BertIntermediate(config)
+        self.output = BertOutput(config)
 
-    def forward(self, hidden_state, encoder_output):
-        raise NotImplementedError
+    def forward(self, hidden_states, encoder_outputs, attention_mask=None, head_mask=None):
+        self_attention_outputs = self.self_attention(query_tensor=hidden_states,
+                                                     key_tensor=hidden_states,
+                                                     value_tensor=hidden_states,
+                                                     attention_mask=attention_mask,
+                                                     head_mask=head_mask)
+        self_attention_output = self_attention_outputs[0]
+        attention_outputs = self.attention(query_tensor=self_attention_output,
+                                           key_tensor=encoder_outputs,
+                                           value_tensor=encoder_outputs,
+                                           attention_mask=attention_mask,
+                                           head_mask=head_mask)
+        attention_output = attention_outputs[0]
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        outputs = (layer_output,) + attention_outputs[1:]
+        return outputs
 
 
 class BertEncoder(nn.Module):

From 15a2fc88a68741163cc9b798921e6b33ef32528a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Tue, 8 Oct 2019 11:10:35 +0200
Subject: [PATCH 010/144] add General attention classes

The modifications that I introduced in a previous commit did break
Bert's internal API. I reverted these changes and added more general
classes to handle the encoder-decoder attention case.

There may be a more elegant way to deal with backward compatibility (I
am not comfortable with the current state of the code), but I cannot see
it right now.
---
 transformers/modeling_bert.py | 158 +++++++++++++++++++++++++++++-----
 1 file changed, 136 insertions(+), 22 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 4011da18b4..a5e36eaed0 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -174,9 +174,9 @@ class BertEmbeddings(nn.Module):
         return embeddings
 
 
-class BertSelfAttention(nn.Module):
+class BertGeneralAttention(nn.Module):
     def __init__(self, config):
-        super(BertSelfAttention, self).__init__()
+        super(BertGeneralAttention, self).__init__()
         if config.hidden_size % config.num_attention_heads != 0:
             raise ValueError(
                 "The hidden size (%d) is not a multiple of the number of attention "
@@ -235,6 +235,67 @@ class BertSelfAttention(nn.Module):
         return outputs
 
 
+class BertSelfAttention(nn.Module):
+    def __init__(self, config):
+        super(BertSelfAttention, self).__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention "
+                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
+        self.output_attentions = config.output_attentions
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
+        mixed_query_layer = self.query(hidden_states)
+        mixed_key_layer = self.key(hidden_states)
+        mixed_value_layer = self.value(hidden_states)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+        key_layer = self.transpose_for_scores(mixed_key_layer)
+        value_layer = self.transpose_for_scores(mixed_value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
+        return outputs
+
+
 class BertSelfOutput(nn.Module):
     def __init__(self, config):
         super(BertSelfOutput, self).__init__()
@@ -279,12 +340,49 @@ class BertAttention(nn.Module):
         self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def forward(self, query_tensor, key_tensor, value_tensor, attention_mask=None, head_mask=None):
-        self_outputs = self.self(query_tensor, key_tensor, value_tensor, attention_mask, head_mask)
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
+        self_outputs = self.self(hidden_states, attention_mask, head_mask)
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class BertDecoderAttention(nn.Module):
+    def __init__(self, config):
+        super(BertDecoderAttention, self).__init__()  # must name this class; super(BertAttention, self) raises TypeError
+        self.self = BertGeneralAttention(config)
+        self.output = BertSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
+        heads = set(heads) - self.pruned_heads  # Convert to set and emove already pruned heads
+        for head in heads:
+            # Compute how many pruned heads are before the head and move the index accordingly
+            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
+            mask[head] = 0
+        mask = mask.view(-1).contiguous().eq(1)
+        index = torch.arange(len(mask))[mask].long()
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(self, query, key, value, attention_mask=None, head_mask=None):
+        self_outputs = self.self(query, key, value, attention_mask, head_mask)
         # in encoder-decoder attention we use the output of the previous decoder stage as the query
         # in the Multi-Head Attention. We thus pass query_tensor as the residual in BertOutput.
         # This shows the limits of the current code architecture, which may benefit from some refactoring.
-        attention_output = self.output(self_outputs[0], query_tensor)
+        attention_output = self.output(self_outputs[0], query)
         outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
         return outputs
 
@@ -326,11 +424,7 @@ class BertEncoderLayer(nn.Module):
         self.output = BertOutput(config)
 
     def forward(self, hidden_states, attention_mask=None, head_mask=None):
-        attention_outputs = self.attention(query_tensor=hidden_states,
-                                           key_tensor=hidden_states,
-                                           value_tensor=hidden_states,
-                                           attention_mask=attention_mask,
-                                           head_mask=head_mask)
+        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
         attention_output = attention_outputs[0]
         intermediate_output = self.intermediate(attention_output)
         layer_output = self.output(intermediate_output, attention_output)
@@ -342,20 +436,16 @@ class BertDecoderLayer(nn.Module):
     def __init__(self, config):
         super(BertDecoderLayer, self).__init__()
         self.self_attention = BertAttention(config)
-        self.attention = BertAttention(config)
+        self.attention = BertDecoderAttention(config)
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)
 
     def forward(self, hidden_states, encoder_outputs, attention_mask=None, head_mask=None):
-        self_attention_outputs = self.self_attention(query_tensor=hidden_states,
-                                                     key_tensor=hidden_states,
-                                                     value_tensor=hidden_states,
-                                                     attention_mask=attention_mask,
-                                                     head_mask=head_mask)
+        self_attention_outputs = self.self_attention(hidden_states, attention_mask, head_mask)
         self_attention_output = self_attention_outputs[0]
-        attention_outputs = self.attention(query_tensor=self_attention_output,
-                                           key_tensor=encoder_outputs,
-                                           value_tensor=encoder_outputs,
+        attention_outputs = self.attention(query=self_attention_output,
+                                           key=encoder_outputs,
+                                           value=encoder_outputs,
                                            attention_mask=attention_mask,
                                            head_mask=head_mask)
         attention_output = attention_outputs[0]
@@ -399,10 +489,34 @@ class BertEncoder(nn.Module):
 
 class BertDecoder(nn.Module):
     def __init__(self, config):
-        raise NotImplementedError
+        super(BertDecoder, self).__init__()
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.layers = nn.ModuleList([BertDecoderLayer(config) for _ in range(config.num_hidden_layers)])
 
-    def forward(self, encoder_output):
-        raise NotImplementedError
+    def forward(self, hidden_states, encoder_outputs, attention_mask=None, head_mask=None):
+        all_hidden_states = ()
+        all_attentions = ()
+        for i, layer_module in enumerate(self.layers):  # attribute is `layers`, as set in __init__
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(hidden_states, encoder_outputs, attention_mask, head_mask[i])
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+            hidden_states = layer_outputs[0]
+
+        # Add last layer
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (all_attentions,)
+        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
 
 
 class BertPooler(nn.Module):

From 75feacf172d5aa26314929c0f3bb54d9e845e00b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Tue, 8 Oct 2019 11:39:04 +0200
Subject: [PATCH 011/144] add general structure for Bert2Bert class

---
 transformers/modeling_bert.py | 60 +++++++++++++++++++++++++++++++++++
 1 file changed, 60 insertions(+)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index a5e36eaed0..1e228556b4 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -1310,3 +1310,63 @@ class BertForQuestionAnswering(BertPreTrainedModel):
             outputs = (total_loss,) + outputs
 
         return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("Bert encoder-decoder model",
+                      BERT_START_DOCSTRING,
+                      BERT_INPUTS_DOCSTRING)
+class Bert2Bert(BertPreTrainedModel):
+    r"""
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+
+    Examples::
+
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = Bert2Bert.from_pretrained('bert-base-uncased')
+        input = tokenizer.encode("Hello, how are you?")
+        outputs = model(input)
+        output_text = tokenize.decode(outputs[0])
+        print(output_text)
+    """
+
+    def __init__(self, config):
+        super(Bert2Bert, self).__init__(config)
+        self.embeddings = BertEmbeddings(config)
+        self.encoder = BertEncoder(config)
+        self.decoder = BertDecoder(config)
+
+    def forward(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+        if attention_mask is None:
+            attention_mask = torch.ones_like(inputs)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(inputs)
+
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        embedding_output = self.embeddings(inputs, position_ids=position_ids, token_type_ids=token_type_ids)
+        encoder_outputs = self.encoder(embedding_output,
+                                       extended_attention_mask,
+                                       head_mask=head_mask)
+        decoder_outputs = self.decoder(embedding_output,
+                                       encoder_outputs[0],
+                                       extended_attention_mask,
+                                       head_mask=head_mask)
+        sequence_output = decoder_outputs[0]
+        pooled_output = self.pooler(sequence_output)
+
+        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)

From 070098309074e161ed1df35cf918013aeccba462 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Tue, 8 Oct 2019 15:39:47 +0200
Subject: [PATCH 012/144] Add BertDecoderModel and Bert2Bert classes

I am not sure what happens when the class is initialized with the
pretrained weights.
---
 transformers/modeling_bert.py | 169 +++++++++++++++++++++++++++-------
 1 file changed, 134 insertions(+), 35 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 1e228556b4..9ce32d808e 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -788,6 +788,110 @@ class BertModel(BertPreTrainedModel):
         return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
 
 
+@add_start_docstrings("""A bare Bert decoder Model transformer outputting raw hidden-states without any specific head on top.
+                      The model follows the general transformer decoder architecture.""",
+                      BERT_START_DOCSTRING,
+                      BERT_INPUTS_DOCSTRING)
+class BertDecoderModel(BertPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            objective during Bert pretraining. This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+        model = BertDecoderModel.from_pretrained('bert-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    def __init__(self, config):
+        super(BertDecoderModel, self).__init__(config)  # must name this class; it is not a subclass of BertModel
+
+        self.embeddings = BertEmbeddings(config)
+        self.decoder = BertDecoder(config)
+        self.pooler = BertPooler(config)
+
+        self.init_weights()
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        old_embeddings = self.embeddings.word_embeddings
+        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
+        self.embeddings.word_embeddings = new_embeddings
+        return self.embeddings.word_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model (this model has no `encoder`; layers live in `self.decoder.layers`).
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.decoder.layers[layer].attention.prune_heads(heads)
+
+    def forward(self, input_ids, encoder_outputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to fload if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
+        decoder_outputs = self.decoder(embedding_output,
+                                       encoder_outputs,
+                                       extended_attention_mask,
+                                       head_mask=head_mask)
+        sequence_output = decoder_outputs[0]
+        pooled_output = self.pooler(sequence_output)
+
+        outputs = (sequence_output, pooled_output,) + decoder_outputs[1:]  # extras come from the decoder; encoder_outputs is an input
+        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
+
+
 @add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
                        a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
                       BERT_START_DOCSTRING,
@@ -1312,13 +1416,20 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
 
 
-@add_start_docstrings("Bert encoder-decoder model",
+@add_start_docstrings("Bert encoder-decoder model for sequence generation.",
                       BERT_START_DOCSTRING,
                       BERT_INPUTS_DOCSTRING)
 class Bert2Bert(BertPreTrainedModel):
     r"""
 
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
 
     Examples::
 
@@ -1328,45 +1439,33 @@ class Bert2Bert(BertPreTrainedModel):
         outputs = model(input)
         output_text = tokenize.decode(outputs[0])
         print(output_text)
+
+    References::
+
+    [1] "Leveraging Pre-trained Checkpoints for Sequence Generation Tasks", S.Rothe, S.Narayan & A.Severyn (2019) ArXiV:1907.12461v1
+    [2] Tensor2Tensor library https://github.com/tensorflow/tensor2tensor
+
     """
 
     def __init__(self, config):
         super(Bert2Bert, self).__init__(config)
-        self.embeddings = BertEmbeddings(config)
-        self.encoder = BertEncoder(config)
-        self.decoder = BertDecoder(config)
+        self.encoder = BertModel(config)
+        self.decoder = BertDecoderModel(config)
 
-    def forward(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
-        if attention_mask is None:
-            attention_mask = torch.ones_like(inputs)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros_like(inputs)
+        self.init_weights()
 
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # this attention mask is more simple than the triangular masking of causal attention
-        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-        embedding_output = self.embeddings(inputs, position_ids=position_ids, token_type_ids=token_type_ids)
-        encoder_outputs = self.encoder(embedding_output,
-                                       extended_attention_mask,
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+        encoder_outputs = self.encoder(input_ids,
+                                       attention_mask=attention_mask,
+                                       token_type_ids=token_type_ids,
+                                       position_ids=position_ids,
                                        head_mask=head_mask)
-        decoder_outputs = self.decoder(embedding_output,
-                                       encoder_outputs[0],
-                                       extended_attention_mask,
-                                       head_mask=head_mask)
-        sequence_output = decoder_outputs[0]
-        pooled_output = self.pooler(sequence_output)
+        encoder_output = encoder_outputs[0]
 
-        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
-        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
+        decoder_input = torch.empty_like(input_ids).normal_(mean=0.0, std=self.config.initializer_range)
+        decoder_outputs = self.decoder(decoder_input,
+                                       encoder_output,
+                                       token_type_ids=token_type_ids,
+                                       position_ids=position_ids,
+                                       head_mask=head_mask)
+        return decoder_outputs

From 82628b0fc921e5e3f250bcad10f2b3c54111c17f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Tue, 8 Oct 2019 15:57:25 +0200
Subject: [PATCH 013/144] add a placeholder test

---
 transformers/__init__.py                 |  2 +-
 transformers/tests/modeling_bert_test.py | 29 ++++++++++++++----------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index 5248bc9f1b..bf302992b2 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -64,7 +64,7 @@ if is_torch_available():
                                 BertForMaskedLM, BertForNextSentencePrediction,
                                 BertForSequenceClassification, BertForMultipleChoice,
                                 BertForTokenClassification, BertForQuestionAnswering,
-                                load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+                                load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, Bert2Bert)
     from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                                 OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                                 load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py
index 633c97e263..2a2c3e50ea 100644
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -27,9 +27,9 @@ from .configuration_common_test import ConfigTester
 
 if is_torch_available():
     from transformers import (BertConfig, BertModel, BertForMaskedLM,
-                                        BertForNextSentencePrediction, BertForPreTraining,
-                                        BertForQuestionAnswering, BertForSequenceClassification,
-                                        BertForTokenClassification, BertForMultipleChoice)
+                              BertForNextSentencePrediction, BertForPreTraining,
+                              BertForQuestionAnswering, BertForSequenceClassification,
+                              BertForTokenClassification, BertForMultipleChoice, Bert2Bert)
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 else:
     pytestmark = pytest.mark.skip("Require Torch")
@@ -38,8 +38,8 @@ else:
 class BertModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (BertModel, BertForMaskedLM, BertForNextSentencePrediction,
-            BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
-            BertForTokenClassification) if is_torch_available() else ()
+                         BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification,
+                         BertForTokenClassification) if is_torch_available() else ()
 
     class BertModelTester(object):
 
@@ -66,7 +66,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                      num_labels=3,
                      num_choices=4,
                      scope=None,
-                    ):
+                     ):
             self.parent = parent
             self.batch_size = batch_size
             self.seq_length = seq_length
@@ -145,7 +145,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.seq_length, self.hidden_size])
             self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
 
-
         def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForMaskedLM(config=config)
             model.eval()
@@ -172,7 +171,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, 2])
             self.check_loss_output(result)
 
-
         def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForPreTraining(config=config)
             model.eval()
@@ -191,7 +189,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, 2])
             self.check_loss_output(result)
 
-
         def create_and_check_bert_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForQuestionAnswering(config=config)
             model.eval()
@@ -210,7 +207,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.seq_length])
             self.check_loss_output(result)
 
-
         def create_and_check_bert_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForSequenceClassification(config)
@@ -225,7 +221,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.num_labels])
             self.check_loss_output(result)
 
-
         def create_and_check_bert_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_labels = self.num_labels
             model = BertForTokenClassification(config=config)
@@ -240,7 +235,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.seq_length, self.num_labels])
             self.check_loss_output(result)
 
-
         def create_and_check_bert_for_multiple_choice(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_choices = self.num_choices
             model = BertForMultipleChoice(config=config)
@@ -261,6 +255,16 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.num_choices])
             self.check_loss_output(result)
 
+        def create_and_check_bert2bert(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            config.num_choices = self.num_choices
+            model = Bert2Bert(config=config)
+            model.eval()
+            bert2bert_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+            bert2bert_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+            bert2bert_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+            _ = model(bert2bert_inputs_ids,
+                      attention_mask=bert2bert_input_mask,
+                      token_type_ids=bert2bert_token_type_ids)
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
@@ -316,5 +320,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
 
+
 if __name__ == "__main__":
     unittest.main()

From 8abfee9ec327aea0005a7ad367639217ca7dd215 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Tue, 8 Oct 2019 16:07:25 +0200
Subject: [PATCH 014/144] rename Bert2Bert -> Bert2Rnd

---
 transformers/__init__.py                 | 2 +-
 transformers/modeling_bert.py            | 7 ++++---
 transformers/tests/modeling_bert_test.py | 4 ++--
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index bf302992b2..006ba9ed16 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -64,7 +64,7 @@ if is_torch_available():
                                 BertForMaskedLM, BertForNextSentencePrediction,
                                 BertForSequenceClassification, BertForMultipleChoice,
                                 BertForTokenClassification, BertForQuestionAnswering,
-                                load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, Bert2Bert)
+                                load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, Bert2Rnd)
     from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                                 OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                                 load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 9ce32d808e..258e4c3430 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -1419,7 +1419,7 @@ class BertForQuestionAnswering(BertPreTrainedModel):
 @add_start_docstrings("Bert encoder-decoder model for sequence generation.",
                       BERT_START_DOCSTRING,
                       BERT_INPUTS_DOCSTRING)
-class Bert2Bert(BertPreTrainedModel):
+class Bert2Rnd(BertPreTrainedModel):
     r"""
 
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -1434,7 +1434,8 @@ class Bert2Bert(BertPreTrainedModel):
     Examples::
 
         tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = Bert2Bert.from_pretrained('bert-base-uncased')
+        model = Bert2Rnd.from_pretrained('bert-base-uncased')
+        # fine-tuning magic happens here
         input = tokenizer.encode("Hello, how are you?")
         outputs = model(input)
         output_text = tokenize.decode(outputs[0])
@@ -1468,4 +1469,4 @@ class Bert2Bert(BertPreTrainedModel):
                                        token_type_ids=token_type_ids,
                                        position_ids=position_ids,
                                        head_mask=head_mask)
-        return decoder_outputs
+        return decoder_outputs[0]
diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py
index 2a2c3e50ea..24acf565e3 100644
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -29,7 +29,7 @@ if is_torch_available():
     from transformers import (BertConfig, BertModel, BertForMaskedLM,
                               BertForNextSentencePrediction, BertForPreTraining,
                               BertForQuestionAnswering, BertForSequenceClassification,
-                              BertForTokenClassification, BertForMultipleChoice, Bert2Bert)
+                              BertForTokenClassification, BertForMultipleChoice, Bert2Rnd)
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 else:
     pytestmark = pytest.mark.skip("Require Torch")
@@ -257,7 +257,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
 
         def create_and_check_bert2bert(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_choices = self.num_choices
-            model = Bert2Bert(config=config)
+            model = Bert2Rnd(config=config)
             model.eval()
             bert2bert_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
             bert2bert_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()

From 61ed8890052eb628fe969ed440a38ff82577595c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi@hugginface.co>
Date: Tue, 8 Oct 2019 16:30:07 +0200
Subject: [PATCH 015/144] remove old seq2seq file

---
 transformers/modeling_seq2seq.py            | 39 ---------------------
 transformers/tests/modeling_bert_test.py    |  2 +-
 transformers/tests/modeling_seq2seq_test.py | 23 ------------
 3 files changed, 1 insertion(+), 63 deletions(-)
 delete mode 100644 transformers/modeling_seq2seq.py
 delete mode 100644 transformers/tests/modeling_seq2seq_test.py

diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
deleted file mode 100644
index b14622e50f..0000000000
--- a/transformers/modeling_seq2seq.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Conditional generation class. """
-
-
-class Seq2SeqModel(object):
-    def __init__(self):
-        raise EnvironmentError(
-            """Seq2Seq is designed to be instantiated using the
-        `Seq2Seq.from_pretrained(encoder_name_or_path, decoder_name_or_path)` method."""
-        )
-
-    @classmethod
-    def from_pretrained(cls, encoder_name, decoder_name):
-        # Here we should call AutoModel to initialize the models depending
-        # on the pretrained models taken as an input.
-        # For a first iteration we only work with Bert.
-        raise NotImplementedError
-
-    def __call__(self):
-        # allows to call an instance of the class
-        # model = Seq2Seq(encode='bert', decoder='bert')
-        raise NotImplementedError
-
-    def process(self):
-        # alternative API to __call__ it is more explicit.
-        raise NotImplementedError
diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py
index 24acf565e3..fe9e039983 100644
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -255,7 +255,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.num_choices])
             self.check_loss_output(result)
 
-        def create_and_check_bert2bert(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_bert2rnd(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             config.num_choices = self.num_choices
             model = Bert2Rnd(config=config)
             model.eval()
diff --git a/transformers/tests/modeling_seq2seq_test.py b/transformers/tests/modeling_seq2seq_test.py
deleted file mode 100644
index 1866dc10af..0000000000
--- a/transformers/tests/modeling_seq2seq_test.py
+++ /dev/null
@@ -1,23 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-
-
-class Seq2SeqTest(unittest.TestCase):
-    raise NotImplementedError
-
-
-def __main__():
-    unittest.main()

From 770b15b58ceb66a5da72d8030e9aff05fd50848a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Tue, 8 Oct 2019 17:32:28 +0200
Subject: [PATCH 016/144] rename class in __init__

---
 transformers/modeling_bert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 258e4c3430..fc698c772e 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -1449,7 +1449,7 @@ class Bert2Rnd(BertPreTrainedModel):
     """
 
     def __init__(self, config):
-        super(Bert2Bert, self).__init__(config)
+        super(Bert2Rnd, self).__init__(config)
         self.encoder = BertModel(config)
         self.decoder = BertDecoderModel(config)
 

From 851ef592c57bfb0af3807548e798570242c45510 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 10:02:03 +0200
Subject: [PATCH 017/144] add comment on recursive weights loading

---
 transformers/modeling_utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index 84b64e3ca4..ea114a76fd 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -383,6 +383,8 @@ class PreTrainedModel(nn.Module):
             if metadata is not None:
                 state_dict._metadata = metadata
 
+            # PyTorch's `_load_from_state_dict` does not copy parameters in a module's descendants
+            # so we need to apply the function recursively.
             def load(module, prefix=''):
                 local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
                 module._load_from_state_dict(

From 877ef2c6cae3059ff9307387baaed886139c5eff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 10:02:18 +0200
Subject: [PATCH 018/144] override `from_pretrained` in Bert2Rnd

In the seq2seq model we need to both load pretrained weights in the
encoder and initialize the decoder randomly. Because the
`from_pretrained` method defined in the base class relies on module
names to assign weights, it would also initialize the decoder with
pretrained weights. To avoid this we override the method to only
initialize the encoder with pretrained weights.
---
 transformers/modeling_bert.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index fc698c772e..db8847f39e 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -1455,6 +1455,37 @@ class Bert2Rnd(BertPreTrainedModel):
 
         self.init_weights()
 
+    @classmethod
+    def from_pretrained(cls, pretrained_model_or_path, *model_args, **model_kwargs):
+        """ Load the pretrained weights in the encoder.
+
+        Since the decoder needs to be initialized with random weights, and the encoder with
+        pretrained weights we need to override the `from_pretrained` method of the base `PreTrainedModel`
+        class.
+        """
+        pretrained_encoder = BertModel.from_pretrained(pretrained_model_or_path, *model_args, **model_kwargs)
+
+        config = cls._load_config(pretrained_model_or_path, *model_args, **model_kwargs)
+        model = cls(config)
+        model.encoder = pretrained_encoder
+
+        return model
+
+    def _load_config(self, pretrained_model_name_or_path, *args, **kwargs):
+        config = kwargs.pop('config', None)
+        if config is None:
+            cache_dir = kwargs.pop('cache_dir', None)
+            force_download = kwargs.pop('force_download', False)
+            config, _ = self.config_class.from_pretrained(
+                pretrained_model_name_or_path,
+                *args,
+                cache_dir=cache_dir,
+                return_unused_kwargs=True,
+                force_download=force_download,
+                **kwargs
+            )
+        return config
+
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         encoder_outputs = self.encoder(input_ids,
                                        attention_mask=attention_mask,

From 09cfd122353347da7a62eb4f5af75d83b955684f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 10:15:27 +0200
Subject: [PATCH 019/144] remove `BertGeneralAttention` and do the branching in `BertSelfAttention`

---
 transformers/modeling_bert.py | 68 +++--------------------------------
 1 file changed, 5 insertions(+), 63 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index db8847f39e..94791571cd 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -174,67 +174,6 @@ class BertEmbeddings(nn.Module):
         return embeddings
 
 
-class BertGeneralAttention(nn.Module):
-    def __init__(self, config):
-        super(BertGeneralAttention, self).__init__()
-        if config.hidden_size % config.num_attention_heads != 0:
-            raise ValueError(
-                "The hidden size (%d) is not a multiple of the number of attention "
-                "heads (%d)" % (config.hidden_size, config.num_attention_heads))
-        self.output_attentions = config.output_attentions
-
-        self.num_attention_heads = config.num_attention_heads
-        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
-        self.all_head_size = self.num_attention_heads * self.attention_head_size
-
-        self.query = nn.Linear(config.hidden_size, self.all_head_size)
-        self.key = nn.Linear(config.hidden_size, self.all_head_size)
-        self.value = nn.Linear(config.hidden_size, self.all_head_size)
-
-        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
-
-    def transpose_for_scores(self, x):
-        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
-        x = x.view(*new_x_shape)
-        return x.permute(0, 2, 1, 3)
-
-    def forward(self, query, key, value, attention_mask=None, head_mask=None):
-        mixed_query_layer = self.query(query)
-        mixed_key_layer = self.key(key)
-        mixed_value_layer = self.value(value)
-
-        query_layer = self.transpose_for_scores(mixed_query_layer)
-        key_layer = self.transpose_for_scores(mixed_key_layer)
-        value_layer = self.transpose_for_scores(mixed_value_layer)
-
-        # Take the dot product between "query" and "key" to get the raw attention scores.
-        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
-        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
-        if attention_mask is not None:
-            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
-            attention_scores = attention_scores + attention_mask
-
-        # Normalize the attention scores to probabilities.
-        attention_probs = nn.Softmax(dim=-1)(attention_scores)
-
-        # This is actually dropping out entire tokens to attend to, which might
-        # seem a bit unusual, but is taken from the original Transformer paper.
-        attention_probs = self.dropout(attention_probs)
-
-        # Mask heads if we want to
-        if head_mask is not None:
-            attention_probs = attention_probs * head_mask
-
-        context_layer = torch.matmul(attention_probs, value_layer)
-
-        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
-        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
-        context_layer = context_layer.view(*new_context_layer_shape)
-
-        outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
-        return outputs
-
-
 class BertSelfAttention(nn.Module):
     def __init__(self, config):
         super(BertSelfAttention, self).__init__()
@@ -259,10 +198,13 @@ class BertSelfAttention(nn.Module):
         x = x.view(*new_x_shape)
         return x.permute(0, 2, 1, 3)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None):
-        mixed_query_layer = self.query(hidden_states)
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None):
         mixed_key_layer = self.key(hidden_states)
         mixed_value_layer = self.value(hidden_states)
+        if encoder_hidden_states:  # if encoder-decoder attention
+            mixed_query_layer = self.query(encoder_hidden_states)
+        else:
+            mixed_query_layer = self.query(hidden_states)
 
         query_layer = self.transpose_for_scores(mixed_query_layer)
         key_layer = self.transpose_for_scores(mixed_key_layer)

From edfc8f822557f3df7d9057a6457a933cddf15299 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 10:17:27 +0200
Subject: [PATCH 020/144] Remove `BertDecoderAttention` and do the branching in `BertAttention`

---
 transformers/modeling_bert.py | 44 ++---------------------------------
 1 file changed, 2 insertions(+), 42 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 94791571cd..89407ff8ab 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -282,53 +282,13 @@ class BertAttention(nn.Module):
         self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None):
-        self_outputs = self.self(hidden_states, attention_mask, head_mask)
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None):
+        self_outputs = self.self(hidden_states, attention_mask, head_mask, encoder_hidden_states)
         attention_output = self.output(self_outputs[0], hidden_states)
         outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
         return outputs
 
 
-class BertDecoderAttention(nn.Module):
-    def __init__(self, config):
-        super(BertAttention, self).__init__()
-        self.self = BertGeneralAttention(config)
-        self.output = BertSelfOutput(config)
-        self.pruned_heads = set()
-
-    def prune_heads(self, heads):
-        if len(heads) == 0:
-            return
-        mask = torch.ones(self.self.num_attention_heads, self.self.attention_head_size)
-        heads = set(heads) - self.pruned_heads  # Convert to set and emove already pruned heads
-        for head in heads:
-            # Compute how many pruned heads are before the head and move the index accordingly
-            head = head - sum(1 if h < head else 0 for h in self.pruned_heads)
-            mask[head] = 0
-        mask = mask.view(-1).contiguous().eq(1)
-        index = torch.arange(len(mask))[mask].long()
-
-        # Prune linear layers
-        self.self.query = prune_linear_layer(self.self.query, index)
-        self.self.key = prune_linear_layer(self.self.key, index)
-        self.self.value = prune_linear_layer(self.self.value, index)
-        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
-
-        # Update hyper params and store pruned heads
-        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
-        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
-        self.pruned_heads = self.pruned_heads.union(heads)
-
-    def forward(self, query, key, value, attention_mask=None, head_mask=None):
-        self_outputs = self.self(query, key, value, attention_mask, head_mask)
-        # in encoder-decoder attention we use the output of the previous decoder stage as the query
-        # in the Multi-Head Attention. We thus pass query_tensor as the residual in BertOutput.
-        # This shows the limits of the current code architecture, which may benefit from some refactoring.
-        attention_output = self.output(self_outputs[0], query)
-        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
-        return outputs
-
-
 class BertIntermediate(nn.Module):
     def __init__(self, config):
         super(BertIntermediate, self).__init__()

From 9ca788b2e8f02ea08796e66628b1fd176245f896 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 11:33:28 +0200
Subject: [PATCH 021/144] merge the two Bert layers classes

---
 transformers/modeling_bert.py | 54 +++++++++++++++--------------------
 1 file changed, 23 insertions(+), 31 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 89407ff8ab..f982364f5e 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -318,15 +318,26 @@ class BertOutput(nn.Module):
         return hidden_states
 
 
-class BertEncoderLayer(nn.Module):
+class BertLayer(nn.Module):
     def __init__(self, config):
-        super(BertEncoderLayer, self).__init__()
-        self.attention = BertAttention(config)
+        super(BertLayer, self).__init__()
+        self.self_attention = BertAttention(config)
+        if config.get('is_decoder', False):
+            self.attention = BertAttention(config)
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None):
-        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_state=None):
+        self_attention_outputs = self.self_attention(hidden_states, attention_mask, head_mask)
+        self_attention_output = self_attention_outputs[0]
+
+        attention_outputs = self_attention_outputs
+        if encoder_hidden_state:
+            try:
+                attention_outputs = self.attention(self_attention_output, attention_mask, head_mask, encoder_hidden_state)
+            except AttributeError as ae:
+                raise ae("you need to set `is_encoder` to True in the configuration to instantiate an encoder layer")
+
         attention_output = attention_outputs[0]
         intermediate_output = self.intermediate(attention_output)
         layer_output = self.output(intermediate_output, attention_output)
@@ -334,35 +345,12 @@ class BertEncoderLayer(nn.Module):
         return outputs
 
 
-class BertDecoderLayer(nn.Module):
-    def __init__(self, config):
-        super(BertDecoderLayer, self).__init__()
-        self.self_attention = BertAttention(config)
-        self.attention = BertDecoderAttention(config)
-        self.intermediate = BertIntermediate(config)
-        self.output = BertOutput(config)
-
-    def forward(self, hidden_states, encoder_outputs, attention_mask=None, head_mask=None):
-        self_attention_outputs = self.self_attention(hidden_states, attention_mask, head_mask)
-        self_attention_output = self_attention_outputs[0]
-        attention_outputs = self.attention(query=self_attention_output,
-                                           key=encoder_outputs,
-                                           value=encoder_outputs,
-                                           attention_mask=attention_mask,
-                                           head_mask=head_mask)
-        attention_output = attention_outputs[0]
-        intermediate_output = self.intermediate(attention_output)
-        layer_output = self.output(intermediate_output, attention_output)
-        outputs = (layer_output,) + attention_outputs[1:]
-        return outputs
-
-
 class BertEncoder(nn.Module):
     def __init__(self, config):
         super(BertEncoder, self).__init__()
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
-        self.layer = nn.ModuleList([BertEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
 
     def forward(self, hidden_states, attention_mask=None, head_mask=None):
         all_hidden_states = ()
@@ -392,9 +380,10 @@ class BertEncoder(nn.Module):
 class BertDecoder(nn.Module):
     def __init__(self, config):
         super(BertDecoder, self).__init__()
+        config["is_decoder"] = True
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
-        self.layers = nn.ModuleList([BertEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.layers = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
 
     def forward(self, hidden_states, encoder_outputs, attention_mask=None, head_mask=None):
         all_hidden_states = ()
@@ -403,7 +392,10 @@ class BertDecoder(nn.Module):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
-            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i])
+            layer_outputs = layer_module(hidden_states,
+                                         attention_mask=attention_mask,
+                                         head_mask=head_mask[i],
+                                         encoder_hidden_state=encoder_outputs)
             if self.output_attentions:
                 all_attentions = all_attentions + (layer_outputs[1],)
 

From df85a0ff0b2847295fde06ab5fd6d2bcb217d59e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 11:38:26 +0200
Subject: [PATCH 022/144] replace double quotes with simple quotes

---
 transformers/modeling_bert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index f982364f5e..a5b21510aa 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -322,7 +322,7 @@ class BertLayer(nn.Module):
     def __init__(self, config):
         super(BertLayer, self).__init__()
         self.self_attention = BertAttention(config)
-        if config.get('is_decoder', False):
+        if config.get("is_decoder", False):
             self.attention = BertAttention(config)
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)

From 17177e73796f516e3f49d311eab77b02ab679871 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 12:03:58 +0200
Subject: [PATCH 023/144] add is_decoder as an attribute to Config class

---
 transformers/modeling_bert.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index a5b21510aa..9e03c2f8d4 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -322,7 +322,7 @@ class BertLayer(nn.Module):
     def __init__(self, config):
         super(BertLayer, self).__init__()
         self.self_attention = BertAttention(config)
-        if config.get("is_decoder", False):
+        if getattr(config, "is_decoder", False):
             self.attention = BertAttention(config)
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)
@@ -380,7 +380,7 @@ class BertEncoder(nn.Module):
 class BertDecoder(nn.Module):
     def __init__(self, config):
         super(BertDecoder, self).__init__()
-        config["is_decoder"] = True
+        config.is_decoder = True
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
         self.layers = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])

From 51261167b4a1de53cd38cc2b1553e5d71ba360ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 12:17:22 +0200
Subject: [PATCH 024/144] prune both attention and self-attention heads

---
 transformers/modeling_bert.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 9e03c2f8d4..fddf5d52a2 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -633,7 +633,7 @@ class BertModel(BertPreTrainedModel):
             See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
-            self.encoder.layer[layer].attention.prune_heads(heads)
+            self.encoder.layer[layer].self_attention.prune_heads(heads)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if attention_mask is None:
@@ -736,7 +736,8 @@ class BertDecoderModel(BertPreTrainedModel):
             See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
-            self.encoder.layer[layer].attention.prune_heads(heads)
+            self.decoder.layer[layer].attention.prune_heads(heads)
+            self.decoder.layer[layer].self_attention.prune_heads(heads)
 
     def forward(self, input_ids, encoder_outputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if attention_mask is None:

From d7092d592ca55391b3c07505539b9e4c71bf79de Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 12:51:14 +0200
Subject: [PATCH 025/144] rename the attributes in the Bert Layer

Since the preloading of weights relies on the names of the class's
attributes, changing the namespace breaks loading pretrained weights on
Bert and all related models. I reverted `self_attention` to `attention`
and use `crossattention` for the decoder instead.
---
 transformers/modeling_bert.py | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index fddf5d52a2..5fcf41a1e1 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -321,25 +321,24 @@ class BertOutput(nn.Module):
 class BertLayer(nn.Module):
     def __init__(self, config):
         super(BertLayer, self).__init__()
-        self.self_attention = BertAttention(config)
+        self.attention = BertAttention(config)
         if getattr(config, "is_decoder", False):
-            self.attention = BertAttention(config)
+            self.crossattention = BertAttention(config)
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)
 
     def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_state=None):
-        self_attention_outputs = self.self_attention(hidden_states, attention_mask, head_mask)
-        self_attention_output = self_attention_outputs[0]
+        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
+        attention_output = attention_outputs[0]
 
-        attention_outputs = self_attention_outputs
         if encoder_hidden_state:
             try:
-                attention_outputs = self.attention(self_attention_output, attention_mask, head_mask, encoder_hidden_state)
+                crossattention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_state)
             except AttributeError as ae:
                 raise ae("you need to set `is_encoder` to True in the configuration to instantiate an encoder layer")
 
-        attention_output = attention_outputs[0]
-        intermediate_output = self.intermediate(attention_output)
+        crossattention_output = crossattention_outputs[0]
+        intermediate_output = self.intermediate(crossattention_output)
         layer_output = self.output(intermediate_output, attention_output)
         outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
         return outputs
@@ -633,7 +632,7 @@ class BertModel(BertPreTrainedModel):
             See base class PreTrainedModel
         """
         for layer, heads in heads_to_prune.items():
-            self.encoder.layer[layer].self_attention.prune_heads(heads)
+            self.encoder.layer[layer].attention.prune_heads(heads)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if attention_mask is None:
@@ -737,7 +736,7 @@ class BertDecoderModel(BertPreTrainedModel):
         """
         for layer, heads in heads_to_prune.items():
             self.decoder.layer[layer].attention.prune_heads(heads)
-            self.decoder.layer[layer].self_attention.prune_heads(heads)
+            self.decoder.layer[layer].crossattention.prune_heads(heads)
 
     def forward(self, input_ids, encoder_outputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         if attention_mask is None:

From 81ee29ee8d64c292c3fd5fc7e13b387acd1bfc39 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 14:13:37 +0200
Subject: [PATCH 026/144] remove the staticmethod used to load the config

---
 transformers/modeling_bert.py | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 5fcf41a1e1..6dae6d6ce5 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -715,7 +715,7 @@ class BertDecoderModel(BertPreTrainedModel):
 
     """
     def __init__(self, config):
-        super(BertModel, self).__init__(config)
+        super(BertDecoderModel, self).__init__(config)
 
         self.embeddings = BertEmbeddings(config)
         self.decoder = BertDecoder(config)
@@ -1357,28 +1357,27 @@ class Bert2Rnd(BertPreTrainedModel):
         pretrained weights we need to override the `from_pretrained` method of the base `PreTrainedModel`
         class.
         """
-        pretrained_encoder = BertModel.from_pretrained(pretrained_model_or_path, *model_args, **model_kwargs)
 
-        config = cls._load_config(pretrained_model_or_path, *model_args, **model_kwargs)
-        model = cls(config)
-        model.encoder = pretrained_encoder
-
-        return model
-
-    def _load_config(self, pretrained_model_name_or_path, *args, **kwargs):
-        config = kwargs.pop('config', None)
+        # Load the configuration
+        config = model_kwargs.pop('config', None)
         if config is None:
-            cache_dir = kwargs.pop('cache_dir', None)
-            force_download = kwargs.pop('force_download', False)
-            config, _ = self.config_class.from_pretrained(
-                pretrained_model_name_or_path,
-                *args,
+            cache_dir = model_kwargs.pop('cache_dir', None)
+            force_download = model_kwargs.pop('force_download', False)
+            config, _ = cls.config_class.from_pretrained(
+                pretrained_model_or_path,
+                *model_args,
                 cache_dir=cache_dir,
                 return_unused_kwargs=True,
                 force_download=force_download,
-                **kwargs
+                **model_kwargs
             )
-        return config
+        model = cls(config)
+
+        # The encoder is loaded with pretrained weights
+        pretrained_encoder = BertModel.from_pretrained(pretrained_model_or_path, *model_args, **model_kwargs)
+        model.encoder = pretrained_encoder
+
+        return model
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
         encoder_outputs = self.encoder(input_ids,

From 3e1cd8241eddc7f3ec036c26f1cbbd3272088653 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 14:18:20 +0200
Subject: [PATCH 027/144] fix stupid (re)naming issue

---
 transformers/modeling_bert.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 6dae6d6ce5..5d53b981e5 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -333,12 +333,12 @@ class BertLayer(nn.Module):
 
         if encoder_hidden_state:
             try:
-                crossattention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_state)
+                attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_state)
             except AttributeError as ae:
                 raise ae("you need to set `is_encoder` to True in the configuration to instantiate an encoder layer")
 
-        crossattention_output = crossattention_outputs[0]
-        intermediate_output = self.intermediate(crossattention_output)
+        attention_output = attention_outputs[0]
+        intermediate_output = self.intermediate(attention_output)
         layer_output = self.output(intermediate_output, attention_output)
         outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
         return outputs

From fa218e648abc4f2c2d8a897ed0b4f2f050ecaca4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 15:16:07 +0200
Subject: [PATCH 028/144] fix syntax errors

---
 transformers/modeling_bert.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 5d53b981e5..bce7972315 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -201,7 +201,7 @@ class BertSelfAttention(nn.Module):
     def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None):
         mixed_key_layer = self.key(hidden_states)
         mixed_value_layer = self.value(hidden_states)
-        if encoder_hidden_states:  # if encoder-decoder attention
+        if encoder_hidden_states is not None:  # if encoder-decoder attention
             mixed_query_layer = self.query(encoder_hidden_states)
         else:
             mixed_query_layer = self.query(hidden_states)
@@ -331,11 +331,12 @@ class BertLayer(nn.Module):
         attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
         attention_output = attention_outputs[0]
 
-        if encoder_hidden_state:
+        if encoder_hidden_state is not None:
             try:
                 attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_state)
             except AttributeError as ae:
-                raise ae("you need to set `is_encoder` to True in the configuration to instantiate an encoder layer")
+                print("You need to set `is_encoder` to True in the configuration to instantiate an encoder layer:", ae)
+                raise
 
         attention_output = attention_outputs[0]
         intermediate_output = self.intermediate(attention_output)
@@ -382,7 +383,7 @@ class BertDecoder(nn.Module):
         config.is_decoder = True
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
-        self.layers = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
+        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
 
     def forward(self, hidden_states, encoder_outputs, attention_mask=None, head_mask=None):
         all_hidden_states = ()
@@ -738,7 +739,7 @@ class BertDecoderModel(BertPreTrainedModel):
             self.decoder.layer[layer].attention.prune_heads(heads)
             self.decoder.layer[layer].crossattention.prune_heads(heads)
 
-    def forward(self, input_ids, encoder_outputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+    def forward(self, input_ids, encoder_outputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
         if token_type_ids is None:
@@ -782,7 +783,7 @@ class BertDecoderModel(BertPreTrainedModel):
         sequence_output = decoder_outputs[0]
         pooled_output = self.pooler(sequence_output)
 
-        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        outputs = (sequence_output, pooled_output,) + decoder_outputs[1:]  # add hidden_states and attentions if they are here
         return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
 
 
@@ -1387,8 +1388,7 @@ class Bert2Rnd(BertPreTrainedModel):
                                        head_mask=head_mask)
         encoder_output = encoder_outputs[0]
 
-        decoder_input = torch.empty_like(input_ids).normal_(mean=0.0, std=self.config.initializer_range)
-        decoder_outputs = self.decoder(decoder_input,
+        decoder_outputs = self.decoder(input_ids,
                                        encoder_output,
                                        token_type_ids=token_type_ids,
                                        position_ids=position_ids,

From 1e68c28670cc8d0e8d20ca9fadc697f03908015b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 10 Oct 2019 18:07:11 +0200
Subject: [PATCH 029/144] add test for initialization of Bert2Rnd

---
 examples/run_summarization.py            | 49 ++++++++++++++++++++++++
 transformers/tests/modeling_bert_test.py | 12 +++---
 2 files changed, 55 insertions(+), 6 deletions(-)
 create mode 100644 examples/run_summarization.py

diff --git a/examples/run_summarization.py b/examples/run_summarization.py
new file mode 100644
index 0000000000..0a367551d6
--- /dev/null
+++ b/examples/run_summarization.py
@@ -0,0 +1,49 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning seq2seq models for abstractive summarization.
+
+The finetuning method for abstractive summarization is inspired by [1]. We
+concatenate the document and summary, mask words of the summary at random and
+maximizing the likelihood of masked words.
+
+[1] Dong Li, Nan Yang, Wenhui Wang, Furu Wei, Xiaodong Liu, Yu Wang, Jianfeng
+Gao, Ming Zhou, and Hsiao-Wuen Hon.  “Unified Language Model Pre-Training for
+Natural Language Understanding and Generation.” (May 2019) ArXiv:1905.03197
+"""
+
+import logging
+import random
+
+import numpy as np
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def train(args, train_dataset, model, tokenizer):
+    raise NotImplementedError
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    raise NotImplementedError
diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py
index fe9e039983..e649cd8ce8 100644
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -259,12 +259,12 @@ class BertModelTest(CommonTestCases.CommonModelTester):
             config.num_choices = self.num_choices
             model = Bert2Rnd(config=config)
             model.eval()
-            bert2bert_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            bert2bert_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            bert2bert_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            _ = model(bert2bert_inputs_ids,
-                      attention_mask=bert2bert_input_mask,
-                      token_type_ids=bert2bert_token_type_ids)
+            bert2rnd_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+            bert2rnd_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+            bert2rnd_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
+            _ = model(bert2rnd_inputs_ids,
+                      attention_mask=bert2rnd_input_mask,
+                      token_type_ids=bert2rnd_token_type_ids)
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()

From f8e98d67793341f8955634c942af1af579f097dd Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Fri, 11 Oct 2019 16:48:11 +0200
Subject: [PATCH 030/144] load pretrained embeddings in Bert decoder

In Rothe et al.'s "Leveraging Pre-trained Checkpoints for Sequence
Generation Tasks", Bert2Bert is initialized with pre-trained weights for
the encoder, and only pre-trained embeddings for the decoder. The
current version of the code completely randomizes the weights of the
decoder.

We write a custom function to initialize the weights of the decoder; we
first initialize the decoder with the pretrained weights and then
randomize everything but the embeddings.
---
 transformers/modeling_bert.py | 33 +++++++++++++++++++++++----------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index bce7972315..03559ad26c 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -1348,15 +1348,14 @@ class Bert2Rnd(BertPreTrainedModel):
         self.encoder = BertModel(config)
         self.decoder = BertDecoderModel(config)
 
-        self.init_weights()
-
     @classmethod
     def from_pretrained(cls, pretrained_model_or_path, *model_args, **model_kwargs):
         """ Load the pretrained weights in the encoder.
 
-        Since the decoder needs to be initialized with random weights, and the encoder with
-        pretrained weights we need to override the `from_pretrained` method of the base `PreTrainedModel`
-        class.
+        The encoder of `Bert2Rand` is initialized with pretrained weights; the
+        weights of the decoder are initialized at random except the embeddings
+        which are initialized with the pretrained embeddings. We thus need to override
+        the base class' `from_pretrained` method.
         """
 
         # Load the configuration
@@ -1374,10 +1373,26 @@ class Bert2Rnd(BertPreTrainedModel):
             )
         model = cls(config)
 
-        # The encoder is loaded with pretrained weights
+        # We load the encoder with pretrained weights
         pretrained_encoder = BertModel.from_pretrained(pretrained_model_or_path, *model_args, **model_kwargs)
         model.encoder = pretrained_encoder
 
+        # We load the decoder with pretrained weights and then randomize all weights but embeddings-related one.
+        def randomize_decoder_weights(module):
+            if isinstance(module, nn.Linear):
+                # Slightly different from the TF version which uses truncated_normal for initialization
+                # cf https://github.com/pytorch/pytorch/pull/5617
+                module.weight.data.normal_(mean=0.0, std=config.initializer_range)
+            elif isinstance(module, BertLayerNorm):
+                module.bias.data.zero_()
+                module.weight.data.fill_(1.0)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
+
+        pretrained_decoder = BertDecoderModel.from_pretrained(pretrained_model_or_path, *model_args, **model_kwargs)
+        pretrained_decoder.apply(randomize_decoder_weights)
+        model.decoder = pretrained_decoder
+
         return model
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
@@ -1386,11 +1401,9 @@ class Bert2Rnd(BertPreTrainedModel):
                                        token_type_ids=token_type_ids,
                                        position_ids=position_ids,
                                        head_mask=head_mask)
-        encoder_output = encoder_outputs[0]
-
         decoder_outputs = self.decoder(input_ids,
-                                       encoder_output,
+                                       encoder_outputs[0],
                                        token_type_ids=token_type_ids,
                                        position_ids=position_ids,
                                        head_mask=head_mask)
-        return decoder_outputs[0]
+        return decoder_outputs

From d889e0b71beb12511b7fcc346113035e0115ef0c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Fri, 11 Oct 2019 17:36:12 +0200
Subject: [PATCH 031/144] add base for seq2seq finetuning

---
 examples/run_seq2seq_finetuning.py | 67 ++++++++++++++++++++++++++++++
 1 file changed, 67 insertions(+)
 create mode 100644 examples/run_seq2seq_finetuning.py

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
new file mode 100644
index 0000000000..f318bf8036
--- /dev/null
+++ b/examples/run_seq2seq_finetuning.py
@@ -0,0 +1,67 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018 Microsoft and The HuggingFace Inc.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning seq2seq models for sequence generation.
+
+We use the procedure described in [1] to finetune models for sequence
+generation. Let S1 and S2 be the source and target sequence respectively; we
+pack them using the start of sequence [SOS] and end of sequence [EOS] token:
+
+    [SOS] S1 [EOS] S2 [EOS]
+
+We then mask a fixed percentage of token from S2 at random and learn to predict
+the masked words. [EOS] can be masked during finetuning so the model learns to
+terminate the generation process.
+
+[1] Dong Li, Nan Yang, Wenhui Wang, Furu Wei, Xiaodong Liu, Yu Wang, Jianfeng
+Gao, Ming Zhou, and Hsiao-Wuen Hon.  “Unified Language Model Pre-Training for
+Natural Language Understanding and Generation.” (May 2019) ArXiv:1905.03197
+"""
+
+import logging
+import random
+
+import numpy as np
+import torch
+
+logger = logging.getLogger(__name__)
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def train(args, train_dataset, model, tokenizer):
+    """ Fine-tune the pretrained model on the corpus. """
+    # Data sampler
+    # Data loader
+    # Training
+    raise NotImplementedError
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    raise NotImplementedError
+
+
+def main():
+    raise NotImplementedError
+
+
+def __main__():
+    main()

From b3261e7ace153a78c19e35bba367e28e9ccdd2fa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Fri, 11 Oct 2019 18:40:38 +0200
Subject: [PATCH 032/144] read parameters from CLI, load model & tokenizer

---
 examples/run_seq2seq_finetuning.py | 60 ++++++++++++++++++++++++------
 examples/run_summarization.py      | 49 ------------------------
 2 files changed, 49 insertions(+), 60 deletions(-)
 delete mode 100644 examples/run_summarization.py

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index f318bf8036..7ad8e4df90 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -30,12 +30,15 @@ Gao, Ming Zhou, and Hsiao-Wuen Hon.  “Unified Language Model Pre-Training for
 Natural Language Understanding and Generation.” (May 2019) ArXiv:1905.03197
 """
 
+import argparse
 import logging
 import random
 
 import numpy as np
 import torch
 
+from transformers import BertConfig, Bert2Rnd, BertTokenizer
+
 logger = logging.getLogger(__name__)
 
 
@@ -43,25 +46,60 @@ def set_seed(args):
     random.seed(args.seed)
     np.random.seed(args.seed)
     torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
+
+
+def load_and_cache_examples(args, tokenizer):
+    raise NotImplementedError
 
 
 def train(args, train_dataset, model, tokenizer):
     """ Fine-tune the pretrained model on the corpus. """
-    # Data sampler
-    # Data loader
-    # Training
-    raise NotImplementedError
-
-
-def evaluate(args, model, tokenizer, prefix=""):
     raise NotImplementedError
 
 
 def main():
-    raise NotImplementedError
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument("--train_data_file",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The input training data file (a text file).")
+    parser.add_argument("--output_dir",
+                        default=None,
+                        type=str,
+                        required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    # Optional parameters
+    parser.add_argument("--model_name_or_path",
+                        default="bert-base-cased",
+                        type=str,
+                        help="The model checkpoint for weights initialization.")
+    parser.add_argument("--seed", default=42, type=int)
+    args = parser.parse_args()
+
+    # Set up training device
+    device = torch.device("cpu")
+
+    # Set seed
+    set_seed(args)
+
+    # Load pretrained model and tokenizer
+    config_class, model_class, tokenizer_class = BertConfig, Bert2Rnd, BertTokenizer
+    config = config_class.from_pretrained(args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
+    model = model_class.from_pretrained(args.model_name_or_path, config=config)
+    model.to(device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    train_dataset = load_and_cache_examples(args, tokenizer)
+    global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
 
-def __main__():
+if __name__ == "__main__":
     main()
diff --git a/examples/run_summarization.py b/examples/run_summarization.py
deleted file mode 100644
index 0a367551d6..0000000000
--- a/examples/run_summarization.py
+++ /dev/null
@@ -1,49 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
-# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning seq2seq models for abstractive summarization.
-
-The finetuning method for abstractive summarization is inspired by [1]. We
-concatenate the document and summary, mask words of the summary at random and
-maximizing the likelihood of masked words.
-
-[1] Dong Li, Nan Yang, Wenhui Wang, Furu Wei, Xiaodong Liu, Yu Wang, Jianfeng
-Gao, Ming Zhou, and Hsiao-Wuen Hon.  “Unified Language Model Pre-Training for
-Natural Language Understanding and Generation.” (May 2019) ArXiv:1905.03197
-"""
-
-import logging
-import random
-
-import numpy as np
-import torch
-
-logger = logging.getLogger(__name__)
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-    if args.n_gpu > 0:
-        torch.cuda.manual_seed_all(args.seed)
-
-
-def train(args, train_dataset, model, tokenizer):
-    raise NotImplementedError
-
-
-def evaluate(args, model, tokenizer, prefix=""):
-    raise NotImplementedError

From 5a8c6e771a2f086a06697900d7ba6249c3833556 Mon Sep 17 00:00:00 2001
From: Emrah Budur <emrah.budur@yahoo.com>
Date: Sat, 12 Oct 2019 14:17:17 +0300
Subject: [PATCH 033/144] Fixed the sample code in the title 'Quick tour'.

---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 0cc23c8389..e44ff52099 100644
--- a/README.md
+++ b/README.md
@@ -176,10 +176,11 @@ BERT_MODEL_CLASSES = [BertModel, BertForPreTraining, BertForMaskedLM, BertForNex
 # All the classes for an architecture can be initiated from pretrained weights for this architecture
 # Note that additional weights added for fine-tuning are only initialized
 # and need to be trained on the down-stream task
-tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
+pretrained_weights = 'bert-base-uncased'
+tokenizer = BertTokenizer.from_pretrained(pretrained_weights)
 for model_class in BERT_MODEL_CLASSES:
     # Load pretrained model/tokenizer
-    model = model_class.from_pretrained('bert-base-uncased')
+    model = model_class.from_pretrained(pretrained_weights)
 
     # Models can return full list of hidden-states & attentions weights at each layer
     model = model_class.from_pretrained(pretrained_weights,

From 86f23a19445a920619fceaf60a6ea6a94f253c48 Mon Sep 17 00:00:00 2001
From: Timothy Liu <timothyl@nvidia.com>
Date: Sun, 13 Oct 2019 10:21:35 +0000
Subject: [PATCH 034/144] Minor enhancements to run_tf_glue.py

---
 examples/run_tf_glue.py | 41 ++++++++++++++++++++++++++++-------------
 1 file changed, 28 insertions(+), 13 deletions(-)

diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py
index f2e94ae39e..c05420d680 100644
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -1,40 +1,55 @@
+import os
 import tensorflow as tf
 import tensorflow_datasets
 from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification
 
-# Load dataset, tokenizer, model from pretrained model/vocabulary
+# script parameters
+BATCH_SIZE = 32
+EVAL_BATCH_SIZE = BATCH_SIZE * 2
+
+# Load tokenizer and model from pretrained model/vocabulary
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
 model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
-data = tensorflow_datasets.load('glue/mrpc')
+
+# Load dataset via TensorFlow Datasets
+data, info = tensorflow_datasets.load('glue/mrpc', with_info=True)
+train_examples = info.splits['train'].num_examples
+valid_examples = info.splits['validation'].num_examples
 
 # Prepare dataset for GLUE as a tf.data.Dataset instance
 train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
 valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
-train_dataset = train_dataset.shuffle(100).batch(32).repeat(2)
-valid_dataset = valid_dataset.batch(64)
+train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
+valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
 
 # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
-optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
+optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
 loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
 metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
 model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
 
 # Train and evaluate using tf.keras.Model.fit()
-history = model.fit(train_dataset, epochs=2, steps_per_epoch=115,
-                    validation_data=valid_dataset, validation_steps=7)
+train_steps = train_examples//BATCH_SIZE
+valid_steps = valid_examples//EVAL_BATCH_SIZE
+
+history = model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
+                    validation_data=valid_dataset, validation_steps=valid_steps)
+
+# Save TF2 model
+os.makedirs('./save/', exist_ok=True)
+model.save_pretrained('./save/')
 
 # Load the TensorFlow model in PyTorch for inspection
-model.save_pretrained('./save/')
 pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
 
 # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
-sentence_0 = "This research was consistent with his findings."
-sentence_1 = "His findings were compatible with this research."
-sentence_2 = "His findings were not compatible with this research."
+sentence_0 = 'This research was consistent with his findings.'
+sentence_1 = 'His findings were compatible with this research.'
+sentence_2 = 'His findings were not compatible with this research.'
 inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
 inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
 
 pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
 pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
-print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
-print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
+print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
+print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')

From 376e65a67481bcd370c77b119773b11bb612b0c3 Mon Sep 17 00:00:00 2001
From: Timothy Liu <timothyl@nvidia.com>
Date: Sun, 13 Oct 2019 11:04:49 +0000
Subject: [PATCH 035/144] Added automatic mixed precision and XLA options to
 run_tf_glue.py

---
 examples/run_tf_glue.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py
index c05420d680..399fe9e616 100644
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -6,6 +6,11 @@ from transformers import BertTokenizer, TFBertForSequenceClassification, glue_co
 # script parameters
 BATCH_SIZE = 32
 EVAL_BATCH_SIZE = BATCH_SIZE * 2
+USE_XLA = False
+USE_AMP = False
+
+tf.config.optimizer.set_jit(USE_XLA)
+tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
 
 # Load tokenizer and model from pretrained model/vocabulary
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
@@ -23,10 +28,13 @@ train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
 valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
 
 # Prepare training: Compile tf.keras model with optimizer, loss and learning rate schedule 
-optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
+opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
+if USE_AMP:
+    # loss scaling is currently required when using mixed precision
+    opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
 loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
 metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
-model.compile(optimizer=optimizer, loss=loss, metrics=[metric])
+model.compile(optimizer=opt, loss=loss, metrics=[metric])
 
 # Train and evaluate using tf.keras.Model.fit()
 train_steps = train_examples//BATCH_SIZE

From 099358675899f759110ad8ccecc22c2fab9b1888 Mon Sep 17 00:00:00 2001
From: JulianPani <julian.pani@kenshoo.com>
Date: Mon, 14 Oct 2019 02:09:53 +0300
Subject: [PATCH 036/144] remove usage of DUMMY_INPUTS

Hey @thomwolf
This change https://github.com/huggingface/transformers/commit/da26bae61b8c1e741fdc6735d46c61b43f649561#diff-8ddce309e88e8eb5b4d02228fd8881daL28-L29 removed the constant, but one usage of that constant remains in the code.
---
 transformers/modeling_tf_pytorch_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py
index 5a70d9a72b..88ce4d4610 100644
--- a/transformers/modeling_tf_pytorch_utils.py
+++ b/transformers/modeling_tf_pytorch_utils.py
@@ -198,7 +198,7 @@ def load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path, tf_inputs
     tf_model = tf_model_class(pt_model.config)
 
     if tf_inputs is None:
-        tf_inputs = tf.constant(DUMMY_INPUTS)
+        tf_inputs = tf_model.dummy_inputs
 
     if tf_inputs is not None:
         tfo = tf_model(tf_inputs, training=False)  # Make sure model is built

From 0ef9bc923a3fa3f12d39a516aec2069e9ffc4e6e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 14 Oct 2019 11:58:13 +0200
Subject: [PATCH 037/144] Cleaning up seq2seq [WIP]

---
 transformers/modeling_bert.py    | 284 +++----------------------------
 transformers/modeling_seq2seq.py | 249 +++++++++++++++++++++++++++
 2 files changed, 273 insertions(+), 260 deletions(-)
 create mode 100644 transformers/modeling_seq2seq.py

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 03559ad26c..fbf3c84646 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -199,12 +199,14 @@ class BertSelfAttention(nn.Module):
         return x.permute(0, 2, 1, 3)
 
     def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None):
-        mixed_key_layer = self.key(hidden_states)
-        mixed_value_layer = self.value(hidden_states)
-        if encoder_hidden_states is not None:  # if encoder-decoder attention
-            mixed_query_layer = self.query(encoder_hidden_states)
+        mixed_query_layer = self.query(hidden_states)
+        # if this attention module is used as encoder-decoder (cross) attention, keys and values come from the encoder
+        if encoder_hidden_states is not None:
+            mixed_key_layer = self.key(encoder_hidden_states)
+            mixed_value_layer = self.value(encoder_hidden_states)
         else:
-            mixed_query_layer = self.query(hidden_states)
+            mixed_key_layer = self.key(hidden_states)
+            mixed_value_layer = self.value(hidden_states)
 
         query_layer = self.transpose_for_scores(mixed_query_layer)
         key_layer = self.transpose_for_scores(mixed_key_layer)
@@ -322,26 +324,25 @@ class BertLayer(nn.Module):
     def __init__(self, config):
         super(BertLayer, self).__init__()
         self.attention = BertAttention(config)
-        if getattr(config, "is_decoder", False):
+        self.is_decoder = config.is_decoder
+        if self.is_decoder:
             self.crossattention = BertAttention(config)
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)
 
     def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_state=None):
-        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
-        attention_output = attention_outputs[0]
+        self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
 
-        if encoder_hidden_state is not None:
-            try:
-                attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_state)
-            except AttributeError as ae:
-                print("You need to set `is_encoder` to True in the configuration to instantiate an encoder layer:", ae)
-                raise
+        if self.is_decoder and encoder_hidden_state is not None:
+            cross_attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_state)
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
 
-        attention_output = attention_outputs[0]
         intermediate_output = self.intermediate(attention_output)
         layer_output = self.output(intermediate_output, attention_output)
-        outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
+        outputs = (layer_output,) + outputs
         return outputs
 
 
@@ -352,14 +353,14 @@ class BertEncoder(nn.Module):
         self.output_hidden_states = config.output_hidden_states
         self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None):
         all_hidden_states = ()
         all_attentions = ()
         for i, layer_module in enumerate(self.layer):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
-            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i])
+            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], encoder_hidden_states)
             hidden_states = layer_outputs[0]
 
             if self.output_attentions:
@@ -377,42 +378,6 @@ class BertEncoder(nn.Module):
         return outputs  # last-layer hidden state, (all hidden states), (all attentions)
 
 
-class BertDecoder(nn.Module):
-    def __init__(self, config):
-        super(BertDecoder, self).__init__()
-        config.is_decoder = True
-        self.output_attentions = config.output_attentions
-        self.output_hidden_states = config.output_hidden_states
-        self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
-
-    def forward(self, hidden_states, encoder_outputs, attention_mask=None, head_mask=None):
-        all_hidden_states = ()
-        all_attentions = ()
-        for i, layer_module in enumerate(self.layer):
-            if self.output_hidden_states:
-                all_hidden_states = all_hidden_states + (hidden_states,)
-
-            layer_outputs = layer_module(hidden_states,
-                                         attention_mask=attention_mask,
-                                         head_mask=head_mask[i],
-                                         encoder_hidden_state=encoder_outputs)
-            if self.output_attentions:
-                all_attentions = all_attentions + (layer_outputs[1],)
-
-            hidden_states = layer_outputs[0]
-
-        # Add last layer
-        if self.output_hidden_states:
-            all_hidden_states = all_hidden_states + (hidden_states,)
-
-        outputs = (hidden_states,)
-        if self.output_hidden_states:
-            outputs = outputs + (all_hidden_states,)
-        if self.output_attentions:
-            outputs = outputs + (all_attentions,)
-        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
-
-
 class BertPooler(nn.Module):
     def __init__(self, config):
         super(BertPooler, self).__init__()
@@ -635,7 +600,8 @@ class BertModel(BertPreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None,
+                head_mask=None, encoder_hidden_state=None):
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
         if token_type_ids is None:
@@ -673,8 +639,9 @@ class BertModel(BertPreTrainedModel):
 
         embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
         encoder_outputs = self.encoder(embedding_output,
-                                       extended_attention_mask,
-                                       head_mask=head_mask)
+                                       attention_mask=extended_attention_mask,
+                                       head_mask=head_mask,
+                                       encoder_hidden_state=encoder_hidden_state)
         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output)
 
@@ -682,111 +649,6 @@ class BertModel(BertPreTrainedModel):
         return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
 
 
-@add_start_docstrings("""A bare Bert decoder Model transformer outputting raw hidden-states without any specific head on top.
-                      The model follows the general transformer decoder architecture.""",
-                      BERT_START_DOCSTRING,
-                      BERT_INPUTS_DOCSTRING)
-class BertDecoderModel(BertPreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
-            Sequence of hidden-states at the output of the last layer of the model.
-        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during Bert pretraining. This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = BertDecoderModel.from_pretrained('bert-base-uncased')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
-        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
-
-    """
-    def __init__(self, config):
-        super(BertDecoderModel, self).__init__(config)
-
-        self.embeddings = BertEmbeddings(config)
-        self.decoder = BertDecoder(config)
-        self.pooler = BertPooler(config)
-
-        self.init_weights()
-
-    def _resize_token_embeddings(self, new_num_tokens):
-        old_embeddings = self.embeddings.word_embeddings
-        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
-        self.embeddings.word_embeddings = new_embeddings
-        return self.embeddings.word_embeddings
-
-    def _prune_heads(self, heads_to_prune):
-        """ Prunes heads of the model.
-            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
-            See base class PreTrainedModel
-        """
-        for layer, heads in heads_to_prune.items():
-            self.decoder.layer[layer].attention.prune_heads(heads)
-            self.decoder.layer[layer].crossattention.prune_heads(heads)
-
-    def forward(self, input_ids, encoder_outputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros_like(input_ids)
-
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # this attention mask is more simple than the triangular masking of causal attention
-        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype)  # switch to fload if need + fp16 compatibility
-        else:
-            head_mask = [None] * self.config.num_hidden_layers
-
-        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-        decoder_outputs = self.decoder(embedding_output,
-                                       encoder_outputs,
-                                       extended_attention_mask,
-                                       head_mask=head_mask)
-        sequence_output = decoder_outputs[0]
-        pooled_output = self.pooler(sequence_output)
-
-        outputs = (sequence_output, pooled_output,) + decoder_outputs[1:]  # add hidden_states and attentions if they are here
-        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
-
-
 @add_start_docstrings("""Bert Model with two heads on top as done during the pre-training:
                        a `masked language modeling` head and a `next sentence prediction (classification)` head. """,
                       BERT_START_DOCSTRING,
@@ -1309,101 +1171,3 @@ class BertForQuestionAnswering(BertPreTrainedModel):
             outputs = (total_loss,) + outputs
 
         return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings("Bert encoder-decoder model for sequence generation.",
-                      BERT_START_DOCSTRING,
-                      BERT_INPUTS_DOCSTRING)
-class Bert2Rnd(BertPreTrainedModel):
-    r"""
-
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
-        model = Bert2Rnd.from_pretrained('bert-base-uncased')
-        # fine-tuning magic happens here
-        input = tokenizer.encode("Hello, how are you?")
-        outputs = model(input)
-        output_text = tokenize.decode(outputs[0])
-        print(output_text)
-
-    References::
-
-    [1] "Leveraging Pre-trained Checkpoints for Sequence Generation Tasks", S.Rothe, S.Narayan & A.Severyn (2019) ArXiV:1907.12461v1
-    [2] Tensor2Tensor library https://github.com/tensorflow/tensor2tensor
-
-    """
-
-    def __init__(self, config):
-        super(Bert2Rnd, self).__init__(config)
-        self.encoder = BertModel(config)
-        self.decoder = BertDecoderModel(config)
-
-    @classmethod
-    def from_pretrained(cls, pretrained_model_or_path, *model_args, **model_kwargs):
-        """ Load the pretrained weights in the encoder.
-
-        The encoder of `Bert2Rand` is initialized with pretrained weights; the
-        weights of the decoder are initialized at random except the embeddings
-        which are initialized with the pretrained embeddings. We thus need to override
-        the base class' `from_pretrained` method.
-        """
-
-        # Load the configuration
-        config = model_kwargs.pop('config', None)
-        if config is None:
-            cache_dir = model_kwargs.pop('cache_dir', None)
-            force_download = model_kwargs.pop('force_download', False)
-            config, _ = cls.config_class.from_pretrained(
-                pretrained_model_or_path,
-                *model_args,
-                cache_dir=cache_dir,
-                return_unused_kwargs=True,
-                force_download=force_download,
-                **model_kwargs
-            )
-        model = cls(config)
-
-        # We load the encoder with pretrained weights
-        pretrained_encoder = BertModel.from_pretrained(pretrained_model_or_path, *model_args, **model_kwargs)
-        model.encoder = pretrained_encoder
-
-        # We load the decoder with pretrained weights and then randomize all weights but embeddings-related one.
-        def randomize_decoder_weights(module):
-            if isinstance(module, nn.Linear):
-                # Slightly different from the TF version which uses truncated_normal for initialization
-                # cf https://github.com/pytorch/pytorch/pull/5617
-                module.weight.data.normal_(mean=0.0, std=config.initializer_range)
-            elif isinstance(module, BertLayerNorm):
-                module.bias.data.zero_()
-                module.weight.data.fill_(1.0)
-            if isinstance(module, nn.Linear) and module.bias is not None:
-                module.bias.data.zero_()
-
-        pretrained_decoder = BertDecoderModel.from_pretrained(pretrained_model_or_path, *model_args, **model_kwargs)
-        pretrained_decoder.apply(randomize_decoder_weights)
-        model.decoder = pretrained_decoder
-
-        return model
-
-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
-        encoder_outputs = self.encoder(input_ids,
-                                       attention_mask=attention_mask,
-                                       token_type_ids=token_type_ids,
-                                       position_ids=position_ids,
-                                       head_mask=head_mask)
-        decoder_outputs = self.decoder(input_ids,
-                                       encoder_outputs[0],
-                                       token_type_ids=token_type_ids,
-                                       position_ids=position_ids,
-                                       head_mask=head_mask)
-        return decoder_outputs
diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
new file mode 100644
index 0000000000..50891ddded
--- /dev/null
+++ b/transformers/modeling_seq2seq.py
@@ -0,0 +1,249 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Generic seq2seq (encoder-decoder) model class. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import logging
+
+import torch
+from torch import nn
+
+from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
+from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
+from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
+from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
+from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
+from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
+from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
+from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
+
+from .modeling_utils import PreTrainedModel, SequenceSummary
+
+from .file_utils import add_start_docstrings
+
+logger = logging.getLogger(__name__)
+
+
+class PreTrainedSeq2seq(nn.Module):
+    r"""
+        :class:`~transformers.Seq2seq` is a generic model class
+        that will be instantiated as a Seq2seq model with one of the base model classes of the library
+        as encoder and (optionally) as decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
+        class method.
+
+        The `from_pretrained()` method takes care of returning the correct model class instance
+        using pattern matching on the `pretrained_model_name_or_path` string.
+
+        The base model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertModel (DistilBERT model)
+            - contains `roberta`: RobertaModel (RoBERTa model)
+            - contains `bert`: BertModel (Bert model)
+            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
+            - contains `xlnet`: XLNetModel (XLNet model)
+            - contains `xlm`: XLMModel (XLM model)
+
+        This class cannot be instantiated using `__init__()` (throws an error).
+    """
+    def __init__(self, encoder, decoder):
+        super(PreTrainedSeq2seq, self).__init__()
+        self.encoder = encoder
+        self.decoder = decoder
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        r""" Instantiates one of the base model classes of the library
+        from a pre-trained model configuration.
+
+        The model class to instantiate is selected as the first pattern matching
+        in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `distilbert`: DistilBertModel (DistilBERT model)
+            - contains `roberta`: RobertaModel (RoBERTa model)
+            - contains `bert`: BertModel (Bert model)
+            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
+            - contains `xlnet`: XLNetModel (XLNet model)
+            - contains `xlm`: XLMModel (XLM model)
+
+            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+            To train the model, you should first set it back in training mode with `model.train()`
+
+        Params:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            model_args: (`optional`) Sequence of positional arguments:
+                All remaining positional arguments will be passed to the underlying model's ``__init__`` method
+
+            config: (`optional`) instance of a class derived from :class:`~transformers.PretrainedConfig`:
+                Configuration for the model to use instead of an automatically loaded configuration. Configuration can be automatically loaded when:
+
+                - the model is a model provided by the library (loaded with the ``shortcut-name`` string of a pretrained model), or
+                - the model was saved using :func:`~transformers.PreTrainedModel.save_pretrained` and is reloaded by supplying the save directory.
+                - the model is loaded by supplying a local directory as ``pretrained_model_name_or_path`` and a configuration JSON file named `config.json` is found in the directory.
+
+            state_dict: (`optional`) dict:
+                an optional state dictionary for the model to use instead of a state dictionary loaded from saved weights file.
+                This option can be used if you want to create a model from a pretrained configuration but load your own weights.
+                In this case though, you should check if using :func:`~transformers.PreTrainedModel.save_pretrained` and :func:`~transformers.PreTrainedModel.from_pretrained` is not a simpler option.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                configuration should be cached if the standard cache should not be used.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model weights and configuration files and override the cached versions if they exist.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            output_loading_info: (`optional`) boolean:
+                Set to ``True`` to also return a dictionary containing missing keys, unexpected keys and error messages.
+
+            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+                Can be used to update the configuration object (after it has been loaded) and initiate the model (e.g. ``output_attention=True``). Behaves differently depending on whether a `config` is provided or automatically loaded:
+
+                - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
+                - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
+
+        Examples::
+
+            model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
+            model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
+            model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
+            assert model.config.output_attention == True
+            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
+            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
+            model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+
+        """
+        # Extract encoder and decoder model if provided
+        encoder_model = kwargs.pop('encoder_model', None)
+        decoder_model = kwargs.pop('decoder_model', None)
+
+        # Extract decoder kwargs so we only have encoder kwargs for now
+        if decoder_model is None:
+            decoder_pretrained_model_name_or_path = kwargs.pop('decoder_pretrained_model_name_or_path', pretrained_model_name_or_path)
+        decoder_kwargs = {}
+        for key in kwargs.keys():
+            if key.startswith('decoder_'):
+                decoder_kwargs[key.replace('decoder_', '')] = kwargs.pop(key)
+
+        # Load and initialize the decoder
+        if encoder_model:
+            encoder = encoder_model
+        else:
+            # Load and initialize the encoder
+            kwargs['is_decoder'] = False  # Make sure the encoder will be an encoder
+            if 'distilbert' in pretrained_model_name_or_path:
+                encoder = DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+            elif 'roberta' in pretrained_model_name_or_path:
+                encoder = RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+            elif 'bert' in pretrained_model_name_or_path:
+                encoder = BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+            elif 'openai-gpt' in pretrained_model_name_or_path:
+                encoder = OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+            elif 'gpt2' in pretrained_model_name_or_path:
+                encoder = GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+            elif 'transfo-xl' in pretrained_model_name_or_path:
+                encoder = TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+            elif 'xlnet' in pretrained_model_name_or_path:
+                encoder = XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+            elif 'xlm' in pretrained_model_name_or_path:
+                encoder = XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+            else:
+                raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                                "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                                "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+
+        # Load and initialize the decoder
+        if decoder_model:
+            decoder = decoder_model
+        else:
+            kwargs.update(decoder_kwargs)  # Replace encoder kwargs with decoder specific kwargs like config, state_dict, etc...
+            kwargs['is_decoder'] = True  # Make sure the decoder will be an decoder
+            if 'distilbert' in decoder_pretrained_model_name_or_path:
+                decoder = DistilBertModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
+            elif 'roberta' in decoder_pretrained_model_name_or_path:
+                decoder = RobertaModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
+            elif 'bert' in decoder_pretrained_model_name_or_path:
+                decoder = BertModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
+            elif 'openai-gpt' in decoder_pretrained_model_name_or_path:
+                decoder = OpenAIGPTModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
+            elif 'gpt2' in decoder_pretrained_model_name_or_path:
+                decoder = GPT2Model.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
+            elif 'transfo-xl' in decoder_pretrained_model_name_or_path:
+                decoder = TransfoXLModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
+            elif 'xlnet' in decoder_pretrained_model_name_or_path:
+                decoder = XLNetModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
+            elif 'xlm' in decoder_pretrained_model_name_or_path:
+                decoder = XLMModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
+            else:
+                raise ValueError("Unrecognized model identifier in {}. Should contains one of "
+                                "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
+                                "'xlm', 'roberta'".format(decoder_pretrained_model_name_or_path))
+
+        model = cls(encoder, decoder)
+        return model
+
+    def forward(self, *inputs, *kwargs):
+        # Extract decoder inputs
+        decoder_kwargs = {}
+        for key in kwargs.keys():
+            if key.startswith('decoder_'):
+                decoder_kwargs[key.replace('decoder_', '')] = kwargs.pop(key)
+
+        # Compute encoder hidden states if needed
+        encoder_hidden_states = kwargs.pop('encoder_hidden_states', None)
+        if encoder_hidden_states is None:
+            encoder_outputs = self.encoder(*inputs, *kwargs)
+            encoder_hidden_states = encoder_outputs[0]
+
+        # Decode
+        decoder_kwargs['encoder_hidden_states'] = encoder_hidden_states
+        decoder_outputs = self.decoder(**decoder_kwargs)
+
+        return decoder_outputs
+
+
+class Model2Model(PreTrainedSeq2seq):
+    def tie_weights():
+        # We should tie encoder and decoder embeddings if possible here
+        pass
+
+
+class Model2LSTM(PreTrainedSeq2seq):
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        if kwargs.get('decoder_model', None) is None:
+            # We will create a randomly initilized LSTM model as decoder
+            if 'decoder_config' not in kwargs:
+                raise ValueError("To load an LSTM in Seq2seq model, please supply either: "
+                                "    - a torch.nn.LSTM model as `decoder_model` parameter (`decoder_model=lstm_model`), or "
+                                "    - a dictionary of configuration parameters that will be used to initialize a
+                                "        torch.nn.LSTM model as `decoder_config` keyword argument. "
+                                "        E.g. `decoder_config=\{'input_size': 768, 'hidden_size': 768, 'num_layers': 2\}`")
+            kwargs['decoder_model'] = torch.nn.LSTM(kwarg.pop('decoder_config'))
+        model = super(Model2LSTM, cls).from_pretrained(*args, **kwargs)
+        return model
+

From bfbe68f0352a85c0dfff49c5fb0e8296f698f46e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 14 Oct 2019 12:04:23 +0200
Subject: [PATCH 038/144] update forward pass

---
 transformers/modeling_seq2seq.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index 50891ddded..e8106f47f5 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -218,12 +218,14 @@ class PreTrainedSeq2seq(nn.Module):
         if encoder_hidden_states is None:
             encoder_outputs = self.encoder(*inputs, *kwargs)
             encoder_hidden_states = encoder_outputs[0]
+        else:
+            encoder_outputs = (,)
 
         # Decode
         decoder_kwargs['encoder_hidden_states'] = encoder_hidden_states
         decoder_outputs = self.decoder(**decoder_kwargs)
 
-        return decoder_outputs
+        return decoder_outputs + encoder_outputs
 
 
 class Model2Model(PreTrainedSeq2seq):

From b7141a1bc604b8f9512f89d8dc3ec9dcc062e895 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 14 Oct 2019 12:14:08 +0200
Subject: [PATCH 039/144] maxi simplication

---
 transformers/modeling_seq2seq.py | 75 ++------------------------------
 1 file changed, 3 insertions(+), 72 deletions(-)

diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index e8106f47f5..12792c6e7a 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -21,14 +21,7 @@ import logging
 import torch
 from torch import nn
 
-from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
-from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
-from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
-from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
-from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
-from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
-from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
-from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
+from .modeling_auto import AutoModel, AutoModelWithLMHead
 
 from .modeling_utils import PreTrainedModel, SequenceSummary
 
@@ -43,22 +36,6 @@ class PreTrainedSeq2seq(nn.Module):
         that will be instantiated as a Seq2seq model with one of the base model classes of the library
         as encoder and (optionally) as decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
         class method.
-
-        The `from_pretrained()` method takes care of returning the correct model class instance
-        using pattern matching on the `pretrained_model_name_or_path` string.
-
-        The base model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `distilbert`: DistilBertModel (DistilBERT model)
-            - contains `roberta`: RobertaModel (RoBERTa model)
-            - contains `bert`: BertModel (Bert model)
-            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
-            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
-            - contains `xlnet`: XLNetModel (XLNet model)
-            - contains `xlm`: XLMModel (XLM model)
-
-        This class cannot be instantiated using `__init__()` (throws an error).
     """
     def __init__(self, encoder, decoder):
         super(PreTrainedSeq2seq, self).__init__()
@@ -69,18 +46,6 @@ class PreTrainedSeq2seq(nn.Module):
     def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
         r""" Instantiates one of the base model classes of the library
         from a pre-trained model configuration.
-
-        The model class to instantiate is selected as the first pattern matching
-        in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `distilbert`: DistilBertModel (DistilBERT model)
-            - contains `roberta`: RobertaModel (RoBERTa model)
-            - contains `bert`: BertModel (Bert model)
-            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
-            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
-            - contains `xlnet`: XLNetModel (XLNet model)
-            - contains `xlm`: XLMModel (XLM model)
-
             The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
             To train the model, you should first set it back in training mode with `model.train()`
 
@@ -155,26 +120,7 @@ class PreTrainedSeq2seq(nn.Module):
         else:
             # Load and initialize the encoder
             kwargs['is_decoder'] = False  # Make sure the encoder will be an encoder
-            if 'distilbert' in pretrained_model_name_or_path:
-                encoder = DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-            elif 'roberta' in pretrained_model_name_or_path:
-                encoder = RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-            elif 'bert' in pretrained_model_name_or_path:
-                encoder = BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-            elif 'openai-gpt' in pretrained_model_name_or_path:
-                encoder = OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-            elif 'gpt2' in pretrained_model_name_or_path:
-                encoder = GPT2Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-            elif 'transfo-xl' in pretrained_model_name_or_path:
-                encoder = TransfoXLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-            elif 'xlnet' in pretrained_model_name_or_path:
-                encoder = XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-            elif 'xlm' in pretrained_model_name_or_path:
-                encoder = XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-            else:
-                raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                                "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                                "'xlm', 'roberta'".format(pretrained_model_name_or_path))
+            encoder = AutoModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
         # Load and initialize the decoder
         if decoder_model:
@@ -182,22 +128,7 @@ class PreTrainedSeq2seq(nn.Module):
         else:
             kwargs.update(decoder_kwargs)  # Replace encoder kwargs with decoder specific kwargs like config, state_dict, etc...
             kwargs['is_decoder'] = True  # Make sure the decoder will be an decoder
-            if 'distilbert' in decoder_pretrained_model_name_or_path:
-                decoder = DistilBertModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
-            elif 'roberta' in decoder_pretrained_model_name_or_path:
-                decoder = RobertaModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
-            elif 'bert' in decoder_pretrained_model_name_or_path:
-                decoder = BertModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
-            elif 'openai-gpt' in decoder_pretrained_model_name_or_path:
-                decoder = OpenAIGPTModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
-            elif 'gpt2' in decoder_pretrained_model_name_or_path:
-                decoder = GPT2Model.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
-            elif 'transfo-xl' in decoder_pretrained_model_name_or_path:
-                decoder = TransfoXLModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
-            elif 'xlnet' in decoder_pretrained_model_name_or_path:
-                decoder = XLNetModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
-            elif 'xlm' in decoder_pretrained_model_name_or_path:
-                decoder = XLMModel.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
+            decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
             else:
                 raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                                 "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "

From d9d387afce183364827da297f2160b84ee43d6fd Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 14 Oct 2019 12:14:40 +0200
Subject: [PATCH 040/144] clean up

---
 transformers/modeling_seq2seq.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index 12792c6e7a..466a101f47 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -127,12 +127,8 @@ class PreTrainedSeq2seq(nn.Module):
             decoder = decoder_model
         else:
             kwargs.update(decoder_kwargs)  # Replace encoder kwargs with decoder specific kwargs like config, state_dict, etc...
-            kwargs['is_decoder'] = True  # Make sure the decoder will be an decoder
+            kwargs['is_decoder'] = True  # Make sure the decoder will be a decoder
             decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
-            else:
-                raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                                "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                                "'xlm', 'roberta'".format(decoder_pretrained_model_name_or_path))
 
         model = cls(encoder, decoder)
         return model

From 67d10960ae0183b9fa375660ba3ffdd2bb7e959c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Mon, 14 Oct 2019 14:09:21 +0200
Subject: [PATCH 041/144] load and prepare CNN/Daily Mail data

We write a function to load and preprocess the CNN/Daily Mail dataset as
provided by Li Dong et al. The issue is that this dataset has already
been tokenized by the authors, so we actually need to find the original,
plain-text dataset if we want to apply it to all models.
---
 examples/run_seq2seq_finetuning.py | 108 ++++++++++++++++++++++++++++-
 1 file changed, 105 insertions(+), 3 deletions(-)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index 7ad8e4df90..7941384506 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright 2018 The Microsoft Reseach team and The HuggingFace Inc. team.
 # Copyright (c) 2018 Microsoft and The HuggingFace Inc.  All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -32,10 +32,13 @@ Natural Language Understanding and Generation.” (May 2019) ArXiv:1905.03197
 
 import argparse
 import logging
+import pickle
 import random
+import os
 
 import numpy as np
 import torch
+from torch.utils.data import Dataset
 
 from transformers import BertConfig, Bert2Rnd, BertTokenizer
 
@@ -48,8 +51,107 @@ def set_seed(args):
     torch.manual_seed(args.seed)
 
 
+class TextDataset(Dataset):
+    """ Abstracts a dataset used to train seq2seq models.
+
+    A seq2seq dataset consists in two files:
+    - The source file that contains the source sequences, one line per sequence;
+    - The target file contains the target sequences, one line per sequence.
+
+    The matching betwen source and target sequences is made on the basis of line numbers.
+
+    CNN/Daily News:
+
+    The CNN/Daily News dataset downloaded from [1] consists of two files that
+    respectively contain the stories and the associated summaries. Each line
+    corresponds to a different story. The files contain WordPiece tokens.
+
+    train.src: the longest story contains 6966 tokens, the shortest 12.
+    Sentences are separated with `[SEP_i]` where i is an int between 0 and 9.
+
+    train.tgt: the longest summary contains 2467 tokens, the shortest 4.
+    Sentences are separated with `[X_SEP]` tokens.
+
+    [1] https://github.com/microsoft/unilm
+    """
+    def __init_(self, tokenizer, src_path='train.src', target_path='target.src' block_size=512):
+        assert os.path.isfile(file_path)
+        directory, filename = os.path.split(file_path)
+
+        cached_features_file = os.path.join(directory, "cached_lm_{}_{}".format(block_size, file_name)
+        if os.path.exists(cached_features_file):
+            logger.info("Loading features from cached file %s", cached_features_file)
+            with open(cached_features_file, "rb") as source:
+                self.examples = pickle.load(source)
+        else:
+            logger.info("Creating features from dataset at %s", directory)
+
+            self.examples = []
+            with open(src_path, encoding="utf-8") as source, open(target_path, encoding="utf-8") as target:
+                for line_src, line_tgt in zip(source, target)
+                    src_sequence = line_src.read()
+                    tgt_sequence = line_tgt.read()
+                    example = _truncate_and_concatenate(src_sequence, tgt_sequence, block_size)
+                    if example is not None:
+                        example = tokenizer.convert_tokens_to_ids(example)
+                        self.examples.append(example)
+
+            logger.info("Saving features into cache file %s", cached_features_file)
+            with open(cached_features_file, "wb") as sink:
+                pickle.dump(self.examples, sink, protocole=pickle.HIGHEST_PROTOCOL)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self):
+        return torch.tensor(self.examples[items])
+
+
+def _truncate_and_concatenate(src_sequence, tgt_sequence, block_size):
+    """ Concatenate the sequences and adapt their lengths to the block size.
+
+    Following [1] we perform the following transformations:
+    - Add an [CLS] token at the beginning of the source sequence;
+    - Add an [EOS] token at the end of the source and target sequences;
+    - Concatenate the source and target + tokens sequence. If the concatenated sequence is
+      longer than 512 we follow the 75%/25% rule in [1]: limit the source sequence's length to 384
+      and the target sequence's length to 128.
+
+    [1] Dong, Li, et al. "Unified Language Model Pre-training for Natural
+    Language Understanding and Generation." arXiv preprint arXiv:1905.03197 (2019).
+    """
+    SRC_MAX_LENGTH = int(0.75 * block_size) - 2 # CLS and EOS token
+    TGT_MAX_LENGTH = block_size - SRC_MAX_LENGTH - 1 # EOS token
+
+    # the dataset contains special separator tokens that we remove for now.
+    # They are of the form `[SEP_i]` in the source file, and `[X_SEP]` in the
+    # target file.
+    src_tokens = list(filter(lambda t: "[SEP_" in t, src_sequence.split(" ")))
+    tgt_tokens = list(filter(lambda t: "_SEP]" in t, tgt_sequence.split(" ")))
+
+    # we dump the examples that are too small to fit in the block size for the
+    # sake of simplicity. You can modify this by adding model-specific padding.
+    if len(src_tokens) + len(src_tokens) + 3 < block_size:
+        return None
+
+    # the source sequence has `[SEP_i]` special tokens with i \in [0,9]. We keep them for now.
+    if len(src_tokens) > SRC_MAX_LENGTH
+        if len(tgt_tokens) > TGT_MAX_LENGTH:
+            src_tokens = src_tokens[:SRC_MAX_LENGTH]
+            tgt_tokens = tgt_tokens[:TGT_MAX_LENGTH]
+        else:
+            src_tokens = src_tokens[block_size - len(tgt_tokens) - 3]
+    else:
+        if len(tgt_tokens) > TGT_MAX_LENGTH:
+            tgt_tokens = tgt_tokens[block_size - len(src_tokens) - 3]
+
+    return ["[CLS]"] + src_tokens + ["[EOS]"] + tgt_tokens + ["[EOS]"]
+
+
+
 def load_and_cache_examples(args, tokenizer):
-    raise NotImplementedError
+    dataset = TextDataset(tokenizer, file_path=args.train_data_file)
+    return dataset
 
 
 def train(args, train_dataset, model, tokenizer):
@@ -102,4 +204,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file

From 4e6a55751a510c50347226653df68b07a9caa8c7 Mon Sep 17 00:00:00 2001
From: Simon Layton <slayton58@gmail.com>
Date: Fri, 13 Sep 2019 15:21:40 -0400
Subject: [PATCH 042/144] Force einsum to fp16

---
 examples/run_squad.py | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 43b65d2c3c..71c656a13d 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -138,8 +138,8 @@ def train(args, train_dataset, model, tokenizer):
             model.train()
             batch = tuple(t.to(args.device) for t in batch)
             inputs = {'input_ids':       batch[0],
-                      'attention_mask':  batch[1], 
-                      'start_positions': batch[3], 
+                      'attention_mask':  batch[1],
+                      'start_positions': batch[3],
                       'end_positions':   batch[4]}
             if args.model_type != 'distilbert':
                 inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
@@ -481,6 +481,16 @@ def main():
 
     logger.info("Training/evaluation parameters %s", args)
 
+    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
+    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
+    # remove the need for this code, but it is still valid.
+    if args.fp16:
+        try:
+            import apex
+            apex.amp.register_half_function(torch, 'einsum')
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+
     # Training
     if args.do_train:
         train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)

From 447fffb21ff41d531b714586e6fac9442594eda2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Mon, 14 Oct 2019 18:12:20 +0200
Subject: [PATCH 043/144] process the raw CNN/Daily Mail dataset

The data provided by Li Dong et al. were already tokenized, which means
that they are not compatible with all the models in the library. We
thus process the raw data directly and tokenize them using the models'
tokenizers.
---
 examples/run_seq2seq_finetuning.py | 120 ++++++++++++++++++++---------
 1 file changed, 83 insertions(+), 37 deletions(-)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index 7941384506..4a7042929e 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -17,9 +17,9 @@
 
 We use the procedure described in [1] to finetune models for sequence
 generation. Let S1 and S2 be the source and target sequence respectively; we
-pack them using the start of sequence [SOS] and end of sequence [EOS] token:
+pack them using the start of sequence [EOS] and end of sequence [EOS] token:
 
-    [SOS] S1 [EOS] S2 [EOS]
+    [CLS] S1 [EOS] S2 [EOS]
 
 We then mask a fixed percentage of token from S2 at random and learn to predict
 the masked words. [EOS] can be masked during finetuning so the model learns to
@@ -31,6 +31,7 @@ Natural Language Understanding and Generation.” (May 2019) ArXiv:1905.03197
 """
 
 import argparse
+import dequeue
 import logging
 import pickle
 import random
@@ -54,7 +55,7 @@ def set_seed(args):
 class TextDataset(Dataset):
     """ Abstracts a dataset used to train seq2seq models.
 
-    A seq2seq dataset consists in two files:
+    A seq2seq dataset consists of two files:
     - The source file that contains the source sequences, one line per sequence;
     - The target file contains the target sequences, one line per sequence.
 
@@ -62,43 +63,53 @@ class TextDataset(Dataset):
 
     CNN/Daily News:
 
-    The CNN/Daily News dataset downloaded from [1] consists of two files that
-    respectively contain the stories and the associated summaries. Each line
-    corresponds to a different story. The files contain WordPiece tokens.
+    The CNN/Daily News raw datasets are downloaded from [1]. They consist in stories stored
+    in different files where the summary sentences are indicated by the special `@highlight` token.
+    To process the data, untar both datasets in the same folder, and path the path to this
+    folder as the "train_data_file" argument. The formatting code was inspired by [2].
 
-    train.src: the longest story contains 6966 tokens, the shortest 12.
-    Sentences are separated with `[SEP_i]` where i is an int between 0 and 9.
-
-    train.tgt: the longest summary contains 2467 tokens, the shortest 4.
-    Sentences are separated with `[X_SEP]` tokens.
-
-    [1] https://github.com/microsoft/unilm
+    [1] https://cs.nyu.edu/~kcho/
+    [2] https://github.com/abisee/cnn-dailymail/
     """
-    def __init_(self, tokenizer, src_path='train.src', target_path='target.src' block_size=512):
-        assert os.path.isfile(file_path)
-        directory, filename = os.path.split(file_path)
+    def __init_(self, tokenizer, data_dir='', block_size=512):
+        assert os.path.isdir(data_dir)
 
-        cached_features_file = os.path.join(directory, "cached_lm_{}_{}".format(block_size, file_name)
+        # Load features that have already been computed if present
+        cached_features_file = os.path.join(directory, "cached_lm_{}_{}".format(block_size, data_dir)
         if os.path.exists(cached_features_file):
             logger.info("Loading features from cached file %s", cached_features_file)
             with open(cached_features_file, "rb") as source:
                 self.examples = pickle.load(source)
-        else:
-            logger.info("Creating features from dataset at %s", directory)
+                return
 
-            self.examples = []
-            with open(src_path, encoding="utf-8") as source, open(target_path, encoding="utf-8") as target:
-                for line_src, line_tgt in zip(source, target)
-                    src_sequence = line_src.read()
-                    tgt_sequence = line_tgt.read()
-                    example = _truncate_and_concatenate(src_sequence, tgt_sequence, block_size)
-                    if example is not None:
-                        example = tokenizer.convert_tokens_to_ids(example)
-                        self.examples.append(example)
+        logger.info("Creating features from dataset at %s", directory)
 
-            logger.info("Saving features into cache file %s", cached_features_file)
-            with open(cached_features_file, "wb") as sink:
-                pickle.dump(self.examples, sink, protocole=pickle.HIGHEST_PROTOCOL)
+        # we need to iterate over both the cnn and the dailymail dataset
+        datasets = ['cnn', 'dailymail']
+        for dataset in datasets:
+            path_to_stories = os.path.join(data_dir, dataset, "stories")
+            assert os.path.isdir(path_to_stories)
+
+            stories_files = os.listdir(path_to_stories)
+            for story_file in stories_files:
+                path_to_story = os.path.join(path_to_stories, "story_file")
+                if !os.path.isfile(path_to_story):
+                    continue
+
+                with open(path_to_story, encoding="utf-8") as source:
+                    try:
+                        story, summary = process_story(source)
+                    except IndexError:
+                        continue
+
+                src_sequence = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(story))
+                tgt_sequence = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(summary))
+                example = _truncate_and_concatenate(src_sequence, tgt_sequence, blocksize)
+                self.examples.append(example)
+
+        logger.info("Saving features into cache file %s", cached_features_file)
+        with open(cached_features_file, "wb") as sink:
+            pickle.dump(self.examples, sink, protocole=pickle.HIGHEST_PROTOCOL)
 
     def __len__(self):
         return len(self.examples)
@@ -107,6 +118,46 @@ class TextDataset(Dataset):
         return torch.tensor(self.examples[items])
 
 
+def process_story(story_file):
+    """ Process the text contained in a story file.
+    Returns the story and the summary
+    """
+    file_lines = list(filter(lambda x: len(x)!=0, [line.strip() for lines in story_file]))
+
+    # for some unknown reason some lines miss a period, add it
+    file_lines = [_add_missing_period(line) for line in file_lines]
+
+    # gather article lines
+    story_lines = []
+    lines = dequeue(file_lines)
+    while True:
+        try:
+            element = lines.popleft()
+            if element.startswith("@highlight"):
+                break
+            story_lines.append(element)
+        except IndexError as ie:  # if "@highlight" absent from file
+            raise ie
+
+    # gather summary lines
+    highlights_lines = list(filter(lambda t: !t.startswith("@highlight"), lines))
+
+    # join the lines
+    story = " ".join(story_lines)
+    summary = " ".join(highlights_lines)
+
+    return story, summary
+
+
+def _add_missing_period(line):
+    END_TOKENS = ['.', '!', '?', '...', "'", "`", '"', u'\u2019', u'\u2019', ")"]
+    if line == "@highlight":
+        return line
+    if line[-1] in END_TOKENS:
+        return line
+    return line + " ."
+
+
 def _truncate_and_concatenate(src_sequence, tgt_sequence, block_size):
     """ Concatenate the sequences and adapt their lengths to the block size.
 
@@ -123,12 +174,6 @@ def _truncate_and_concatenate(src_sequence, tgt_sequence, block_size):
     SRC_MAX_LENGTH = int(0.75 * block_size) - 2 # CLS and EOS token
     TGT_MAX_LENGTH = block_size - SRC_MAX_LENGTH - 1 # EOS token
 
-    # the dataset contains special separator tokens that we remove for now.
-    # They are of the form `[SEP_i]` in the source file, and `[X_SEP]` in the
-    # target file.
-    src_tokens = list(filter(lambda t: "[SEP_" in t, src_sequence.split(" ")))
-    tgt_tokens = list(filter(lambda t: "_SEP]" in t, tgt_sequence.split(" ")))
-
     # we dump the examples that are too small to fit in the block size for the
     # sake of simplicity. You can modify this by adding model-specific padding.
     if len(src_tokens) + len(src_tokens) + 3 < block_size:
@@ -145,6 +190,7 @@ def _truncate_and_concatenate(src_sequence, tgt_sequence, block_size):
         if len(tgt_tokens) > TGT_MAX_LENGTH:
             tgt_tokens = tgt_tokens[block_size - len(src_tokens) - 3]
 
+    # I add the special tokens manually, but this should be done by the tokenizer. That's the next step.
     return ["[CLS]"] + src_tokens + ["[EOS]"] + tgt_tokens + ["[EOS]"]
 
 

From 412793275d3773ef0aab0e17b76a4010e7082656 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Mon, 14 Oct 2019 20:45:16 +0200
Subject: [PATCH 044/144] delegate the padding with special tokens to the
 tokenizer

---
 examples/run_seq2seq_finetuning.py | 53 +++++++++++++-----------------
 1 file changed, 22 insertions(+), 31 deletions(-)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index 4a7042929e..5d7da58a23 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -53,20 +53,14 @@ def set_seed(args):
 
 
 class TextDataset(Dataset):
-    """ Abstracts a dataset used to train seq2seq models.
-
-    A seq2seq dataset consists of two files:
-    - The source file that contains the source sequences, one line per sequence;
-    - The target file contains the target sequences, one line per sequence.
-
-    The matching betwen source and target sequences is made on the basis of line numbers.
+    """ Abstracts the dataset used to train seq2seq models.
 
     CNN/Daily News:
 
     The CNN/Daily News raw datasets are downloaded from [1]. They consist in stories stored
     in different files where the summary sentences are indicated by the special `@highlight` token.
-    To process the data, untar both datasets in the same folder, and path the path to this
-    folder as the "train_data_file" argument. The formatting code was inspired by [2].
+    To process the data, untar both datasets in the same folder, and pass the path to this
+    folder as the "data_dir" argument. The formatting code was inspired by [2].
 
     [1] https://cs.nyu.edu/~kcho/
     [2] https://github.com/abisee/cnn-dailymail/
@@ -82,9 +76,8 @@ class TextDataset(Dataset):
                 self.examples = pickle.load(source)
                 return
 
-        logger.info("Creating features from dataset at %s", directory)
+        logger.info("Creating features from dataset at %s", data_dir)
 
-        # we need to iterate over both the cnn and the dailymail dataset
         datasets = ['cnn', 'dailymail']
         for dataset in datasets:
             path_to_stories = os.path.join(data_dir, dataset, "stories")
@@ -102,9 +95,10 @@ class TextDataset(Dataset):
                     except IndexError:
                         continue
 
-                src_sequence = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(story))
-                tgt_sequence = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(summary))
-                example = _truncate_and_concatenate(src_sequence, tgt_sequence, blocksize)
+                story = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(story))
+                summary = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(summary))
+                story_seq, summary_seq = _fit_to_block_size(story, summary, blocksize)
+                example = tokenizer.add_special_token_sequence_pair(story_seq, summary_seq)
                 self.examples.append(example)
 
         logger.info("Saving features into cache file %s", cached_features_file)
@@ -158,15 +152,13 @@ def _add_missing_period(line):
     return line + " ."
 
 
-def _truncate_and_concatenate(src_sequence, tgt_sequence, block_size):
+def _fit_to_block_size(src_sequence, tgt_sequence, block_size):
     """ Concatenate the sequences and adapt their lengths to the block size.
 
-    Following [1] we perform the following transformations:
-    - Add an [CLS] token at the beginning of the source sequence;
-    - Add an [EOS] token at the end of the source and target sequences;
-    - Concatenate the source and target + tokens sequence. If the concatenated sequence is
-      longer than 512 we follow the 75%/25% rule in [1]: limit the source sequence's length to 384
-      and the target sequence's length to 128.
+    Following [1] we truncate the source and target token sequences so they fit
+    in the block size. If the concatenated sequence is longer than 512 we follow
+    the 75%/25% rule in [1]: limit the source sequence's length to 384 and the
+    target sequence's length to 128.
 
     [1] Dong, Li, et al. "Unified Language Model Pre-training for Natural
     Language Understanding and Generation." arXiv preprint arXiv:1905.03197 (2019).
@@ -176,22 +168,21 @@ def _truncate_and_concatenate(src_sequence, tgt_sequence, block_size):
 
     # we dump the examples that are too small to fit in the block size for the
     # sake of simplicity. You can modify this by adding model-specific padding.
-    if len(src_tokens) + len(src_tokens) + 3 < block_size:
+    if len(src_sequence) + len(src_sequence) + 3 < block_size:
         return None
 
     # the source sequence has `[SEP_i]` special tokens with i \in [0,9]. We keep them for now.
-    if len(src_tokens) > SRC_MAX_LENGTH
-        if len(tgt_tokens) > TGT_MAX_LENGTH:
-            src_tokens = src_tokens[:SRC_MAX_LENGTH]
-            tgt_tokens = tgt_tokens[:TGT_MAX_LENGTH]
+    if len(src_sequence) > SRC_MAX_LENGTH
+        if len(tgt_sequence) > TGT_MAX_LENGTH:
+            src_sequence = src_sequence[:SRC_MAX_LENGTH]
+            tgt_sequence = tgt_sequence[:TGT_MAX_LENGTH]
         else:
-            src_tokens = src_tokens[block_size - len(tgt_tokens) - 3]
+            src_sequence = src_sequence[block_size - len(tgt_sequence) - 3]
     else:
         if len(tgt_tokens) > TGT_MAX_LENGTH:
-            tgt_tokens = tgt_tokens[block_size - len(src_tokens) - 3]
+            tgt_sequence = tgt_sequence[block_size - len(src_sequence) - 3]
 
-    # I add the special tokens manually, but this should be done by the tokenizer. That's the next step.
-    return ["[CLS]"] + src_tokens + ["[EOS]"] + tgt_tokens + ["[EOS]"]
+    return src_sequence, tgt_sequence
 
 
 
@@ -250,4 +241,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

From fe25eefc1589a0362e1b60c30734f88f666aff5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Mon, 14 Oct 2019 20:45:39 +0200
Subject: [PATCH 045/144] add instructions to fetch the dataset

---
 examples/README.md | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/examples/README.md b/examples/README.md
index fb5de20a2a..ba58a61012 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -10,6 +10,7 @@ similar API between the different models.
 | [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision.                              |
 | [SQuAD](#squad) | Using BERT for question answering, examples with distributed training.                                                                                  |
 | [Multiple Choice](#multiple choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
+| [Seq2seq Model fine-tuning](#seq2seq-model-fine-tuning) | Fine-tuning the library models for seq2seq tasks on the CNN/Daily Mail dataset. |
 
 ## Language model fine-tuning
 
@@ -387,6 +388,30 @@ f1 = 93.15
 exact_match = 86.91
 ```
 
-This fine-tuneds model is available as a checkpoint under the reference
+This fine-tuned model is available as a checkpoint under the reference
 `bert-large-uncased-whole-word-masking-finetuned-squad`.
 
+## Seq2seq model fine-tuning
+
+Based on the script [`run_seq2seq_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_seq2seq_finetuning.py).
+
+Before running this script you should download **both** the CNN and Daily Mail datasets (the links next to "Stories") from [Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) into the same folder. Then uncompress the archives by running:
+
+```bash
+tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
+```
+
+We will refer to the path where you uncompressed both archives as `$DATA_PATH`.
+
+## Bert2Bert and abstractive summarization
+
+```bash
+export DATA_PATH=/path/to/dataset/
+
+python run_seq2seq_finetuning.py \
+    --output_dir=output \
+    --model_type=bert2bert \
+    --model_name_or_path=bert2bert \
+    --do_train \
+    --data_path=$DATA_PATH \
+```
\ No newline at end of file

From cde42c43544f3e5d9a1b8f29fb0e3f56625a99f8 Mon Sep 17 00:00:00 2001
From: Marianne Stecklina <marianne@omnius.com>
Date: Tue, 17 Sep 2019 15:18:57 +0200
Subject: [PATCH 046/144] Implement fine-tuning BERT on CoNLL-2003 named entity
 recognition task

---
 examples/run_ner.py   | 482 ++++++++++++++++++++++++++++++++++++++++++
 examples/utils_ner.py | 206 ++++++++++++++++++
 2 files changed, 688 insertions(+)
 create mode 100644 examples/run_ner.py
 create mode 100644 examples/utils_ner.py

diff --git a/examples/run_ner.py b/examples/run_ner.py
new file mode 100644
index 0000000000..ce048ade18
--- /dev/null
+++ b/examples/run_ner.py
@@ -0,0 +1,482 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert). """
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import glob
+import logging
+import os
+import random
+
+import numpy as np
+import torch
+from seqeval.metrics import precision_score, recall_score, f1_score
+from tensorboardX import SummaryWriter
+from torch.nn import CrossEntropyLoss
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
+from torch.utils.data.distributed import DistributedSampler
+from tqdm import tqdm, trange
+from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
+
+from pytorch_transformers import AdamW, WarmupLinearSchedule
+from pytorch_transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum(
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, )),
+    ())
+
+MODEL_CLASSES = {
+    "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
+}
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+
+def train(args, train_dataset, model, tokenizer, pad_token_label_id):
+    """ Train the model """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
+         "weight_decay": args.weight_decay},
+        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0}
+    ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                args.train_batch_size * args.gradient_accumulation_steps * (
+                    torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproductibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {"input_ids": batch[0],
+                      "attention_mask": batch[1],
+                      "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
+                      # XLM and RoBERTa don't use segment_ids
+                      "labels": batch[3]}
+            outputs = model(**inputs)
+            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean()  # mean() to average on multi-gpu parallel training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+            else:
+                loss.backward()
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                scheduler.step()  # Update learning rate schedule
+                optimizer.step()
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer, pad_token_label_id)
+                        for key, value in results.items():
+                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
+                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model,
+                                                            "module") else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / global_step
+
+
+def evaluate(args, model, tokenizer, pad_token_label_id, prefix=""):
+    eval_dataset = load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=True)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(eval_dataset) if args.local_rank == -1 else DistributedSampler(eval_dataset)
+    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Eval!
+    logger.info("***** Running evaluation %s *****", prefix)
+    logger.info("  Num examples = %d", len(eval_dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    eval_loss = 0.0
+    nb_eval_steps = 0
+    preds = None
+    out_label_ids = None
+    model.eval()
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        batch = tuple(t.to(args.device) for t in batch)
+
+        with torch.no_grad():
+            inputs = {"input_ids": batch[0],
+                      "attention_mask": batch[1],
+                      "token_type_ids": batch[2] if args.model_type in ["bert", "xlnet"] else None,
+                      # XLM and RoBERTa don't use segment_ids
+                      "labels": batch[3]}
+            outputs = model(**inputs)
+            tmp_eval_loss, logits = outputs[:2]
+
+            eval_loss += tmp_eval_loss.item()
+        nb_eval_steps += 1
+        if preds is None:
+            preds = logits.detach().cpu().numpy()
+            out_label_ids = inputs["labels"].detach().cpu().numpy()
+        else:
+            preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
+            out_label_ids = np.append(out_label_ids, inputs["labels"].detach().cpu().numpy(), axis=0)
+
+    eval_loss = eval_loss / nb_eval_steps
+    preds = np.argmax(preds, axis=2)
+
+    label_map = {i: label for i, label in enumerate(get_labels())}
+
+    out_label_list = [[] for _ in range(out_label_ids.shape[0])]
+    preds_list = [[] for _ in range(out_label_ids.shape[0])]
+
+    for i in range(out_label_ids.shape[0]):
+        for j in range(out_label_ids.shape[1]):
+            if out_label_ids[i, j] != pad_token_label_id:
+                out_label_list[i].append(label_map[out_label_ids[i][j]])
+                preds_list[i].append(label_map[preds[i][j]])
+
+    results = {
+        "loss": eval_loss,
+        "precision": precision_score(out_label_list, preds_list),
+        "recall": recall_score(out_label_list, preds_list),
+        "f1": f1_score(out_label_list, preds_list)
+    }
+
+    logger.info("***** Eval results %s *****", prefix)
+    for key in sorted(results.keys()):
+        logger.info("  %s = %s", key, str(results[key]))
+
+    return results
+
+
+def load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=False):
+    if args.local_rank not in [-1, 0] and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Load data features from cache or dataset file
+    cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format("dev" if evaluate else "train",
+        list(filter(None, args.model_name_or_path.split("/"))).pop(),
+        str(args.max_seq_length)))
+    if os.path.exists(cached_features_file):
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", args.data_dir)
+        label_list = get_labels()
+        examples = read_examples_from_file(args.data_dir, evaluate=evaluate)
+        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer,
+                                                cls_token_at_end=bool(args.model_type in ["xlnet"]),
+                                                # xlnet has a cls token at the end
+                                                cls_token=tokenizer.cls_token,
+                                                cls_token_segment_id=2 if args.model_type in ["xlnet"] else 0,
+                                                sep_token=tokenizer.sep_token,
+                                                sep_token_extra=bool(args.model_type in ["roberta"]),
+                                                # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
+                                                pad_on_left=bool(args.model_type in ["xlnet"]),
+                                                # pad on the left for xlnet
+                                                pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
+                                                pad_token_segment_id=4 if args.model_type in ["xlnet"] else 0,
+                                                pad_token_label_id=pad_token_label_id
+                                                )
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    if args.local_rank == 0 and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_label_ids = torch.tensor([f.label_ids for f in features], dtype=torch.long)
+
+    dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids, all_label_ids)
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--data_dir", default=None, type=str, required=True,
+                        help="The input data dir. Should contain the training files for the CoNLL-2003 NER task.")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model predictions and checkpoints will be written.")
+
+    ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+    parser.add_argument("--max_seq_length", default=128, type=int,
+                        help="The maximum total input sequence length after tokenization. Sequences longer "
+                             "than this will be truncated, sequences shorter will be padded.")
+    parser.add_argument("--do_train", action="store_true",
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action="store_true",
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action="store_true",
+                        help="Whether to run evaluation during training at each logging step.")
+    parser.add_argument("--do_lower_case", action="store_true",
+                        help="Set this flag if you are using an uncased model.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument("--gradient_accumulation_steps", type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight decay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
+
+    parser.add_argument("--logging_steps", type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument("--save_steps", type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action="store_true",
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    parser.add_argument("--no_cuda", action="store_true",
+                        help="Avoid using CUDA when available")
+    parser.add_argument("--overwrite_output_dir", action="store_true",
+                        help="Overwrite the content of the output directory")
+    parser.add_argument("--overwrite_cache", action="store_true",
+                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument("--seed", type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument("--fp16", action="store_true",
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument("--fp16_opt_level", type=str, default="O1",
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="For distributed training: local_rank")
+    parser.add_argument("--server_ip", type=str, default="", help="For distant debugging.")
+    parser.add_argument("--server_port", type=str, default="", help="For distant debugging.")
+    args = parser.parse_args()
+
+    if os.path.exists(args.output_dir) and os.listdir(
+            args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(
+                args.output_dir))
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend="nccl")
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+                        datefmt="%m/%d/%Y %H:%M:%S",
+                        level=logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                   args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+
+    # Set seed
+    set_seed(args)
+
+    # Prepare CONLL-2003 task
+    label_list = get_labels()
+    num_labels = len(label_list)
+    # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
+    pad_token_label_id = CrossEntropyLoss().ignore_index
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
+                                          num_labels=num_labels)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path,
+                                                do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool(".ckpt" in args.model_name_or_path),
+                                        config=config)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer, pad_token_label_id)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+    # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, "training_args.bin"))
+
+    # Evaluation
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + "/**/" + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("pytorch_transformers.modeling_utils").setLevel(logging.WARN)  # Reduce logging
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+            result = evaluate(args, model, tokenizer, pad_token_label_id, prefix=global_step)
+            if global_step:
+                result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
+            results.update(result)
+        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+        with open(output_eval_file, "w") as writer:
+            for key in sorted(results.keys()):
+                writer.write("{} = {}\n".format(key, str(results[key])))
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/utils_ner.py b/examples/utils_ner.py
new file mode 100644
index 0000000000..0d3af3e061
--- /dev/null
+++ b/examples/utils_ner.py
@@ -0,0 +1,206 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Named entity recognition fine-tuning: utilities to work with CoNLL-2003 task. """
+
+from __future__ import absolute_import, division, print_function
+
+import logging
+import os
+from io import open
+
+logger = logging.getLogger(__name__)
+
+
+class InputExample(object):
+    """A single training/test example for token classification."""
+
+    def __init__(self, guid, words, labels):
+        """Constructs a InputExample.
+
+        Args:
+            guid: Unique id for the example.
+            words: list. The words of the sequence.
+            labels: (Optional) list. The labels for each word of the sequence. This should be
+            specified for train and dev examples, but not for test examples.
+        """
+        self.guid = guid
+        self.words = words
+        self.labels = labels
+
+
+class InputFeatures(object):
+    """A single set of features of data."""
+
+    def __init__(self, input_ids, input_mask, segment_ids, label_ids):
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.label_ids = label_ids
+
+
+def read_examples_from_file(data_dir, evaluate=False):
+    if evaluate:
+        file_path = os.path.join(data_dir, "dev.txt")
+        guid_prefix = "dev"
+    else:
+        file_path = os.path.join(data_dir, "train.txt")
+        guid_prefix = "train"
+    guid_index = 1
+    examples = []
+    with open(file_path, encoding="utf-8") as f:
+        words = []
+        labels = []
+        for line in f:
+            if line.startswith("-DOCSTART-") or line == "" or line == "\n":
+                if words:
+                    examples.append(InputExample(guid="{}-{}".format(guid_prefix, guid_index),
+                                                 words=words,
+                                                 labels=labels))
+                    guid_index += 1
+                    words = []
+                    labels = []
+            else:
+                splits = line.split(" ")
+                words.append(splits[0])
+                labels.append(splits[-1][:-1])
+        if words:
+            examples.append(InputExample(guid="%s-%d".format(guid_prefix, guid_index),
+                                         words=words,
+                                         labels=labels))
+    return examples
+
+
+def convert_examples_to_features(examples,
+                                 label_list,
+                                 max_seq_length,
+                                 tokenizer,
+                                 cls_token_at_end=False,
+                                 cls_token="[CLS]",
+                                 cls_token_segment_id=1,
+                                 sep_token="[SEP]",
+                                 sep_token_extra=False,
+                                 pad_on_left=False,
+                                 pad_token=0,
+                                 pad_token_segment_id=0,
+                                 pad_token_label_id=-1,
+                                 sequence_a_segment_id=0,
+                                 mask_padding_with_zero=True):
+    """ Loads a data file into a list of `InputBatch`s
+        `cls_token_at_end` define the location of the CLS token:
+            - False (Default, BERT/XLM pattern): [CLS] + A + [SEP] + B + [SEP]
+            - True (XLNet/GPT pattern): A + [SEP] + B + [SEP] + [CLS]
+        `cls_token_segment_id` define the segment id associated to the CLS token (0 for BERT, 2 for XLNet)
+    """
+
+    label_map = {label: i for i, label in enumerate(label_list)}
+
+    features = []
+    for (ex_index, example) in enumerate(examples):
+        if ex_index % 10000 == 0:
+            logger.info("Writing example %d of %d", ex_index, len(examples))
+
+        tokens = []
+        label_ids = []
+        for word, label in zip(example.words, example.labels):
+            word_tokens = tokenizer.tokenize(word)
+            tokens.extend(word_tokens)
+            # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+            label_ids.extend([label_map[label]] + [pad_token_label_id] * (len(word_tokens) - 1))
+
+        # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
+        special_tokens_count = 3 if sep_token_extra else 2
+        if len(tokens) > max_seq_length - special_tokens_count:
+            tokens = tokens[:(max_seq_length - special_tokens_count)]
+            label_ids = label_ids[:(max_seq_length - special_tokens_count)]
+
+        # The convention in BERT is:
+        # (a) For sequence pairs:
+        #  tokens:   [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]
+        #  type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1
+        # (b) For single sequences:
+        #  tokens:   [CLS] the dog is hairy . [SEP]
+        #  type_ids:   0   0   0   0  0     0   0
+        #
+        # Where "type_ids" are used to indicate whether this is the first
+        # sequence or the second sequence. The embedding vectors for `type=0` and
+        # `type=1` were learned during pre-training and are added to the wordpiece
+        # embedding vector (and position vector). This is not *strictly* necessary
+        # since the [SEP] token unambiguously separates the sequences, but it makes
+        # it easier for the model to learn the concept of sequences.
+        #
+        # For classification tasks, the first vector (corresponding to [CLS]) is
+        # used as as the "sentence vector". Note that this only makes sense because
+        # the entire model is fine-tuned.
+        tokens += [sep_token]
+        label_ids += [pad_token_label_id]
+        if sep_token_extra:
+            # roberta uses an extra separator b/w pairs of sentences
+            tokens += [sep_token]
+            label_ids += [pad_token_label_id]
+        segment_ids = [sequence_a_segment_id] * len(tokens)
+
+        if cls_token_at_end:
+            tokens += [cls_token]
+            label_ids += [pad_token_label_id]
+            segment_ids += [cls_token_segment_id]
+        else:
+            tokens = [cls_token] + tokens
+            label_ids = [pad_token_label_id] + label_ids
+            segment_ids = [cls_token_segment_id] + segment_ids
+
+        input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+        # The mask has 1 for real tokens and 0 for padding tokens. Only real
+        # tokens are attended to.
+        input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+        # Zero-pad up to the sequence length.
+        padding_length = max_seq_length - len(input_ids)
+        if pad_on_left:
+            input_ids = ([pad_token] * padding_length) + input_ids
+            input_mask = ([0 if mask_padding_with_zero else 1] * padding_length) + input_mask
+            segment_ids = ([pad_token_segment_id] * padding_length) + segment_ids
+            label_ids = ([pad_token_label_id] * padding_length) + label_ids
+        else:
+            input_ids += ([pad_token] * padding_length)
+            input_mask += ([0 if mask_padding_with_zero else 1] * padding_length)
+            segment_ids += ([pad_token_segment_id] * padding_length)
+            label_ids += ([pad_token_label_id] * padding_length)
+
+        assert len(input_ids) == max_seq_length
+        assert len(input_mask) == max_seq_length
+        assert len(segment_ids) == max_seq_length
+        assert len(label_ids) == max_seq_length
+
+        if ex_index < 5:
+            logger.info("*** Example ***")
+            logger.info("guid: %s", example.guid)
+            logger.info("tokens: %s", " ".join([str(x) for x in tokens]))
+            logger.info("input_ids: %s", " ".join([str(x) for x in input_ids]))
+            logger.info("input_mask: %s", " ".join([str(x) for x in input_mask]))
+            logger.info("segment_ids: %s", " ".join([str(x) for x in segment_ids]))
+            logger.info("label_ids: %s", " ".join([str(x) for x in label_ids]))
+
+        features.append(
+                InputFeatures(input_ids=input_ids,
+                              input_mask=input_mask,
+                              segment_ids=segment_ids,
+                              label_ids=label_ids))
+    return features
+
+
+def get_labels():
+    return ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

From 3e9420add1e74fb4e900e3cfee415e77343eae41 Mon Sep 17 00:00:00 2001
From: Marianne Stecklina <marianne@omnius.com>
Date: Thu, 19 Sep 2019 09:28:00 +0200
Subject: [PATCH 047/144] Make file reading more robust

---
 examples/utils_ner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/utils_ner.py b/examples/utils_ner.py
index 0d3af3e061..39f6d08149 100644
--- a/examples/utils_ner.py
+++ b/examples/utils_ner.py
@@ -75,7 +75,7 @@ def read_examples_from_file(data_dir, evaluate=False):
             else:
                 splits = line.split(" ")
                 words.append(splits[0])
-                labels.append(splits[-1][:-1])
+                labels.append(splits[-1].replace("\n", ""))
         if words:
             examples.append(InputExample(guid="%s-%d".format(guid_prefix, guid_index),
                                          words=words,

From 99b189df6de71b2f01d6f72e6b1f4aa74455275b Mon Sep 17 00:00:00 2001
From: Marianne Stecklina <marianne@omnius.com>
Date: Thu, 19 Sep 2019 11:29:20 +0200
Subject: [PATCH 048/144] Add cli argument for configuring labels

---
 examples/run_ner.py   | 30 +++++++++++++++---------------
 examples/utils_ner.py | 11 +++++++++--
 2 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/examples/run_ner.py b/examples/run_ner.py
index ce048ade18..f51f5ae2a1 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -55,7 +55,7 @@ def set_seed(args):
         torch.cuda.manual_seed_all(args.seed)
 
 
-def train(args, train_dataset, model, tokenizer, pad_token_label_id):
+def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
@@ -148,7 +148,7 @@ def train(args, train_dataset, model, tokenizer, pad_token_label_id):
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer, pad_token_label_id)
+                        results = evaluate(args, model, tokenizer, labels, pad_token_label_id)
                         for key, value in results.items():
                             tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                     tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
@@ -160,8 +160,7 @@ def train(args, train_dataset, model, tokenizer, pad_token_label_id):
                     output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model,
-                                                            "module") else model  # Take care of distributed/parallel training
+                    model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
                     torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
@@ -179,8 +178,8 @@ def train(args, train_dataset, model, tokenizer, pad_token_label_id):
     return global_step, tr_loss / global_step
 
 
-def evaluate(args, model, tokenizer, pad_token_label_id, prefix=""):
-    eval_dataset = load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=True)
+def evaluate(args, model, tokenizer, labels, pad_token_label_id, prefix=""):
+    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluate=True)
 
     args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
     # Note that DistributedSampler samples randomly
@@ -220,7 +219,7 @@ def evaluate(args, model, tokenizer, pad_token_label_id, prefix=""):
     eval_loss = eval_loss / nb_eval_steps
     preds = np.argmax(preds, axis=2)
 
-    label_map = {i: label for i, label in enumerate(get_labels())}
+    label_map = {i: label for i, label in enumerate(labels)}
 
     out_label_list = [[] for _ in range(out_label_ids.shape[0])]
     preds_list = [[] for _ in range(out_label_ids.shape[0])]
@@ -245,7 +244,7 @@ def evaluate(args, model, tokenizer, pad_token_label_id, prefix=""):
     return results
 
 
-def load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=False):
+def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluate=False):
     if args.local_rank not in [-1, 0] and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
@@ -258,9 +257,8 @@ def load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=False)
         features = torch.load(cached_features_file)
     else:
         logger.info("Creating features from dataset file at %s", args.data_dir)
-        label_list = get_labels()
         examples = read_examples_from_file(args.data_dir, evaluate=evaluate)
-        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer,
+        features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer,
                                                 cls_token_at_end=bool(args.model_type in ["xlnet"]),
                                                 # xlnet has a cls token at the end
                                                 cls_token=tokenizer.cls_token,
@@ -305,6 +303,8 @@ def main():
                         help="The output directory where the model predictions and checkpoints will be written.")
 
     ## Other parameters
+    parser.add_argument("--labels", default="", type=str,
+                        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
     parser.add_argument("--config_name", default="", type=str,
                         help="Pretrained config name or path if not the same as model_name")
     parser.add_argument("--tokenizer_name", default="", type=str,
@@ -406,8 +406,8 @@ def main():
     set_seed(args)
 
     # Prepare CONLL-2003 task
-    label_list = get_labels()
-    num_labels = len(label_list)
+    labels = get_labels(args.labels)
+    num_labels = len(labels)
     # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
     pad_token_label_id = CrossEntropyLoss().ignore_index
 
@@ -433,8 +433,8 @@ def main():
 
     # Training
     if args.do_train:
-        train_dataset = load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=False)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer, pad_token_label_id)
+        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
@@ -466,7 +466,7 @@ def main():
             global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
-            result = evaluate(args, model, tokenizer, pad_token_label_id, prefix=global_step)
+            result = evaluate(args, model, tokenizer, labels, pad_token_label_id, prefix=global_step)
             if global_step:
                 result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
             results.update(result)
diff --git a/examples/utils_ner.py b/examples/utils_ner.py
index 39f6d08149..27f76d5a59 100644
--- a/examples/utils_ner.py
+++ b/examples/utils_ner.py
@@ -202,5 +202,12 @@ def convert_examples_to_features(examples,
     return features
 
 
-def get_labels():
-    return ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
+def get_labels(path):
+    if path:
+        with open(path, "r") as f:
+            labels = f.read().splitlines()
+        if "O" not in labels:
+            labels = ["O"] + labels
+        return labels
+    else:
+        return ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

From 5adb39e757183a00b946d3b0571e1983fd0e26b7 Mon Sep 17 00:00:00 2001
From: Marianne Stecklina <marianne@omnius.com>
Date: Mon, 23 Sep 2019 10:51:54 +0200
Subject: [PATCH 049/144] Add option to predict on test set

---
 examples/run_ner.py   | 46 ++++++++++++++++++++++++++++++++++---------
 examples/utils_ner.py | 19 +++++++++---------
 2 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/examples/run_ner.py b/examples/run_ner.py
index f51f5ae2a1..6c6b0f8336 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -148,7 +148,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer, labels, pad_token_label_id)
+                        results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id)
                         for key, value in results.items():
                             tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                     tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
@@ -178,8 +178,8 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
     return global_step, tr_loss / global_step
 
 
-def evaluate(args, model, tokenizer, labels, pad_token_label_id, prefix=""):
-    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluate=True)
+def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
+    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)
 
     args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
     # Note that DistributedSampler samples randomly
@@ -241,15 +241,15 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, prefix=""):
     for key in sorted(results.keys()):
         logger.info("  %s = %s", key, str(results[key]))
 
-    return results
+    return results, preds_list
 
 
-def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluate=False):
+def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
     if args.local_rank not in [-1, 0] and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
     # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format("dev" if evaluate else "train",
+    cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format(mode,
         list(filter(None, args.model_name_or_path.split("/"))).pop(),
         str(args.max_seq_length)))
     if os.path.exists(cached_features_file):
@@ -257,7 +257,7 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluat
         features = torch.load(cached_features_file)
     else:
         logger.info("Creating features from dataset file at %s", args.data_dir)
-        examples = read_examples_from_file(args.data_dir, evaluate=evaluate)
+        examples = read_examples_from_file(args.data_dir, mode)
         features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer,
                                                 cls_token_at_end=bool(args.model_type in ["xlnet"]),
                                                 # xlnet has a cls token at the end
@@ -318,6 +318,8 @@ def main():
                         help="Whether to run training.")
     parser.add_argument("--do_eval", action="store_true",
                         help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_predict", action="store_true",
+                        help="Whether to run predictions on the test set.")
     parser.add_argument("--evaluate_during_training", action="store_true",
                         help="Whether to run evaluation during training at each logging step.")
     parser.add_argument("--do_lower_case", action="store_true",
@@ -433,7 +435,7 @@ def main():
 
     # Training
     if args.do_train:
-        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluate=False)
+        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")
         global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
@@ -466,7 +468,7 @@ def main():
             global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
-            result = evaluate(args, model, tokenizer, labels, pad_token_label_id, prefix=global_step)
+            result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step)
             if global_step:
                 result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
             results.update(result)
@@ -475,6 +477,32 @@ def main():
             for key in sorted(results.keys()):
                 writer.write("{} = {}\n".format(key, str(results[key])))
 
+    if args.do_predict and args.local_rank in [-1, 0]:
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        model = model_class.from_pretrained(args.output_dir)
+        model.to(args.device)
+        result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
+        # Save results
+        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
+        with open(output_test_results_file, "w") as writer:
+            for key in sorted(result.keys()):
+                writer.write("{} = {}\n".format(key, str(result[key])))
+        # Save predictions
+        output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt")
+        with open(output_test_predictions_file, "w") as writer:
+            with open(os.path.join(args.data_dir, "test.txt"), "r") as f:
+                example_id = 0
+                for line in f:
+                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
+                        writer.write(line)
+                        if not predictions[example_id]:
+                            example_id += 1
+                    elif predictions[example_id]:
+                        output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n"
+                        writer.write(output_line)
+                    else:
+                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
+
     return results
 
 
diff --git a/examples/utils_ner.py b/examples/utils_ner.py
index 27f76d5a59..c20d7b0d1f 100644
--- a/examples/utils_ner.py
+++ b/examples/utils_ner.py
@@ -51,13 +51,8 @@ class InputFeatures(object):
         self.label_ids = label_ids
 
 
-def read_examples_from_file(data_dir, evaluate=False):
-    if evaluate:
-        file_path = os.path.join(data_dir, "dev.txt")
-        guid_prefix = "dev"
-    else:
-        file_path = os.path.join(data_dir, "train.txt")
-        guid_prefix = "train"
+def read_examples_from_file(data_dir, mode):
+    file_path = os.path.join(data_dir, "{}.txt".format(mode))
     guid_index = 1
     examples = []
     with open(file_path, encoding="utf-8") as f:
@@ -66,7 +61,7 @@ def read_examples_from_file(data_dir, evaluate=False):
         for line in f:
             if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                 if words:
-                    examples.append(InputExample(guid="{}-{}".format(guid_prefix, guid_index),
+                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
                                                  words=words,
                                                  labels=labels))
                     guid_index += 1
@@ -75,9 +70,13 @@ def read_examples_from_file(data_dir, evaluate=False):
             else:
                 splits = line.split(" ")
                 words.append(splits[0])
-                labels.append(splits[-1].replace("\n", ""))
+                if len(splits) > 1:
+                    labels.append(splits[-1].replace("\n", ""))
+                else:
+                    # Examples could have no label for mode = "test"
+                    labels.append("O")
         if words:
-            examples.append(InputExample(guid="%s-%d".format(guid_prefix, guid_index),
+            examples.append(InputExample(guid="%s-%d".format(mode, guid_index),
                                          words=words,
                                          labels=labels))
     return examples

From 383ef9674736ed6c97296ab7e2d2f05b2c41f3eb Mon Sep 17 00:00:00 2001
From: Marianne Stecklina <marianne@omnius.com>
Date: Tue, 17 Sep 2019 15:18:57 +0200
Subject: [PATCH 050/144] Implement fine-tuning BERT on CoNLL-2003 named entity
 recognition task

---
 examples/run_ner.py   | 64 ++++++++++++-------------------------------
 examples/utils_ner.py | 30 ++++++++------------
 2 files changed, 30 insertions(+), 64 deletions(-)

diff --git a/examples/run_ner.py b/examples/run_ner.py
index 6c6b0f8336..ce048ade18 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -55,7 +55,7 @@ def set_seed(args):
         torch.cuda.manual_seed_all(args.seed)
 
 
-def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
+def train(args, train_dataset, model, tokenizer, pad_token_label_id):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
@@ -148,7 +148,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
-                        results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id)
+                        results = evaluate(args, model, tokenizer, pad_token_label_id)
                         for key, value in results.items():
                             tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                     tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
@@ -160,7 +160,8 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
                     output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
+                    model_to_save = model.module if hasattr(model,
+                                                            "module") else model  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
                     torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
@@ -178,8 +179,8 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
     return global_step, tr_loss / global_step
 
 
-def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
-    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)
+def evaluate(args, model, tokenizer, pad_token_label_id, prefix=""):
+    eval_dataset = load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=True)
 
     args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
     # Note that DistributedSampler samples randomly
@@ -219,7 +220,7 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
     eval_loss = eval_loss / nb_eval_steps
     preds = np.argmax(preds, axis=2)
 
-    label_map = {i: label for i, label in enumerate(labels)}
+    label_map = {i: label for i, label in enumerate(get_labels())}
 
     out_label_list = [[] for _ in range(out_label_ids.shape[0])]
     preds_list = [[] for _ in range(out_label_ids.shape[0])]
@@ -241,15 +242,15 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
     for key in sorted(results.keys()):
         logger.info("  %s = %s", key, str(results[key]))
 
-    return results, preds_list
+    return results
 
 
-def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
+def load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=False):
     if args.local_rank not in [-1, 0] and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
     # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format(mode,
+    cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format("dev" if evaluate else "train",
         list(filter(None, args.model_name_or_path.split("/"))).pop(),
         str(args.max_seq_length)))
     if os.path.exists(cached_features_file):
@@ -257,8 +258,9 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
         features = torch.load(cached_features_file)
     else:
         logger.info("Creating features from dataset file at %s", args.data_dir)
-        examples = read_examples_from_file(args.data_dir, mode)
-        features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer,
+        label_list = get_labels()
+        examples = read_examples_from_file(args.data_dir, evaluate=evaluate)
+        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer,
                                                 cls_token_at_end=bool(args.model_type in ["xlnet"]),
                                                 # xlnet has a cls token at the end
                                                 cls_token=tokenizer.cls_token,
@@ -303,8 +305,6 @@ def main():
                         help="The output directory where the model predictions and checkpoints will be written.")
 
     ## Other parameters
-    parser.add_argument("--labels", default="", type=str,
-                        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
     parser.add_argument("--config_name", default="", type=str,
                         help="Pretrained config name or path if not the same as model_name")
     parser.add_argument("--tokenizer_name", default="", type=str,
@@ -318,8 +318,6 @@ def main():
                         help="Whether to run training.")
     parser.add_argument("--do_eval", action="store_true",
                         help="Whether to run eval on the dev set.")
-    parser.add_argument("--do_predict", action="store_true",
-                        help="Whether to run predictions on the test set.")
     parser.add_argument("--evaluate_during_training", action="store_true",
                         help="Whether to run evaluation during training at each logging step.")
     parser.add_argument("--do_lower_case", action="store_true",
@@ -408,8 +406,8 @@ def main():
     set_seed(args)
 
     # Prepare CONLL-2003 task
-    labels = get_labels(args.labels)
-    num_labels = len(labels)
+    label_list = get_labels()
+    num_labels = len(label_list)
     # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
     pad_token_label_id = CrossEntropyLoss().ignore_index
 
@@ -435,8 +433,8 @@ def main():
 
     # Training
     if args.do_train:
-        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
+        train_dataset = load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer, pad_token_label_id)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
@@ -468,7 +466,7 @@ def main():
             global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
-            result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step)
+            result = evaluate(args, model, tokenizer, pad_token_label_id, prefix=global_step)
             if global_step:
                 result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
             results.update(result)
@@ -477,32 +475,6 @@ def main():
             for key in sorted(results.keys()):
                 writer.write("{} = {}\n".format(key, str(results[key])))
 
-    if args.do_predict and args.local_rank in [-1, 0]:
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
-        model = model_class.from_pretrained(args.output_dir)
-        model.to(args.device)
-        result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
-        # Save results
-        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
-        with open(output_test_results_file, "w") as writer:
-            for key in sorted(result.keys()):
-                writer.write("{} = {}\n".format(key, str(result[key])))
-        # Save predictions
-        output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt")
-        with open(output_test_predictions_file, "w") as writer:
-            with open(os.path.join(args.data_dir, "test.txt"), "r") as f:
-                example_id = 0
-                for line in f:
-                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
-                        writer.write(line)
-                        if not predictions[example_id]:
-                            example_id += 1
-                    elif predictions[example_id]:
-                        output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n"
-                        writer.write(output_line)
-                    else:
-                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
-
     return results
 
 
diff --git a/examples/utils_ner.py b/examples/utils_ner.py
index c20d7b0d1f..0d3af3e061 100644
--- a/examples/utils_ner.py
+++ b/examples/utils_ner.py
@@ -51,8 +51,13 @@ class InputFeatures(object):
         self.label_ids = label_ids
 
 
-def read_examples_from_file(data_dir, mode):
-    file_path = os.path.join(data_dir, "{}.txt".format(mode))
+def read_examples_from_file(data_dir, evaluate=False):
+    if evaluate:
+        file_path = os.path.join(data_dir, "dev.txt")
+        guid_prefix = "dev"
+    else:
+        file_path = os.path.join(data_dir, "train.txt")
+        guid_prefix = "train"
     guid_index = 1
     examples = []
     with open(file_path, encoding="utf-8") as f:
@@ -61,7 +66,7 @@ def read_examples_from_file(data_dir, mode):
         for line in f:
             if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                 if words:
-                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
+                    examples.append(InputExample(guid="{}-{}".format(guid_prefix, guid_index),
                                                  words=words,
                                                  labels=labels))
                     guid_index += 1
@@ -70,13 +75,9 @@ def read_examples_from_file(data_dir, mode):
             else:
                 splits = line.split(" ")
                 words.append(splits[0])
-                if len(splits) > 1:
-                    labels.append(splits[-1].replace("\n", ""))
-                else:
-                    # Examples could have no label for mode = "test"
-                    labels.append("O")
+                labels.append(splits[-1][:-1])
         if words:
-            examples.append(InputExample(guid="%s-%d".format(mode, guid_index),
+            examples.append(InputExample(guid="%s-%d".format(guid_prefix, guid_index),
                                          words=words,
                                          labels=labels))
     return examples
@@ -201,12 +202,5 @@ def convert_examples_to_features(examples,
     return features
 
 
-def get_labels(path):
-    if path:
-        with open(path, "r") as f:
-            labels = f.read().splitlines()
-        if "O" not in labels:
-            labels = ["O"] + labels
-        return labels
-    else:
-        return ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
+def get_labels():
+    return ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

From e1d4179b64b81178d79639bfe03e2c551313abb4 Mon Sep 17 00:00:00 2001
From: Marianne Stecklina <marianne@omnius.com>
Date: Thu, 19 Sep 2019 09:28:00 +0200
Subject: [PATCH 051/144] Make file reading more robust

---
 examples/utils_ner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/utils_ner.py b/examples/utils_ner.py
index 0d3af3e061..39f6d08149 100644
--- a/examples/utils_ner.py
+++ b/examples/utils_ner.py
@@ -75,7 +75,7 @@ def read_examples_from_file(data_dir, evaluate=False):
             else:
                 splits = line.split(" ")
                 words.append(splits[0])
-                labels.append(splits[-1][:-1])
+                labels.append(splits[-1].replace("\n", ""))
         if words:
             examples.append(InputExample(guid="%s-%d".format(guid_prefix, guid_index),
                                          words=words,

From 7f5367e0b18a56448dde3c4504278e57e6f4beae Mon Sep 17 00:00:00 2001
From: Marianne Stecklina <marianne@omnius.com>
Date: Thu, 19 Sep 2019 11:29:20 +0200
Subject: [PATCH 052/144] Add cli argument for configuring labels

---
 examples/run_ner.py   | 30 +++++++++++++++---------------
 examples/utils_ner.py | 11 +++++++++--
 2 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/examples/run_ner.py b/examples/run_ner.py
index ce048ade18..f51f5ae2a1 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -55,7 +55,7 @@ def set_seed(args):
         torch.cuda.manual_seed_all(args.seed)
 
 
-def train(args, train_dataset, model, tokenizer, pad_token_label_id):
+def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
     """ Train the model """
     if args.local_rank in [-1, 0]:
         tb_writer = SummaryWriter()
@@ -148,7 +148,7 @@ def train(args, train_dataset, model, tokenizer, pad_token_label_id):
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer, pad_token_label_id)
+                        results = evaluate(args, model, tokenizer, labels, pad_token_label_id)
                         for key, value in results.items():
                             tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                     tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
@@ -160,8 +160,7 @@ def train(args, train_dataset, model, tokenizer, pad_token_label_id):
                     output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                     if not os.path.exists(output_dir):
                         os.makedirs(output_dir)
-                    model_to_save = model.module if hasattr(model,
-                                                            "module") else model  # Take care of distributed/parallel training
+                    model_to_save = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
                     model_to_save.save_pretrained(output_dir)
                     torch.save(args, os.path.join(output_dir, "training_args.bin"))
                     logger.info("Saving model checkpoint to %s", output_dir)
@@ -179,8 +178,8 @@ def train(args, train_dataset, model, tokenizer, pad_token_label_id):
     return global_step, tr_loss / global_step
 
 
-def evaluate(args, model, tokenizer, pad_token_label_id, prefix=""):
-    eval_dataset = load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=True)
+def evaluate(args, model, tokenizer, labels, pad_token_label_id, prefix=""):
+    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluate=True)
 
     args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
     # Note that DistributedSampler samples randomly
@@ -220,7 +219,7 @@ def evaluate(args, model, tokenizer, pad_token_label_id, prefix=""):
     eval_loss = eval_loss / nb_eval_steps
     preds = np.argmax(preds, axis=2)
 
-    label_map = {i: label for i, label in enumerate(get_labels())}
+    label_map = {i: label for i, label in enumerate(labels)}
 
     out_label_list = [[] for _ in range(out_label_ids.shape[0])]
     preds_list = [[] for _ in range(out_label_ids.shape[0])]
@@ -245,7 +244,7 @@ def evaluate(args, model, tokenizer, pad_token_label_id, prefix=""):
     return results
 
 
-def load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=False):
+def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluate=False):
     if args.local_rank not in [-1, 0] and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
@@ -258,9 +257,8 @@ def load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=False)
         features = torch.load(cached_features_file)
     else:
         logger.info("Creating features from dataset file at %s", args.data_dir)
-        label_list = get_labels()
         examples = read_examples_from_file(args.data_dir, evaluate=evaluate)
-        features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer,
+        features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer,
                                                 cls_token_at_end=bool(args.model_type in ["xlnet"]),
                                                 # xlnet has a cls token at the end
                                                 cls_token=tokenizer.cls_token,
@@ -305,6 +303,8 @@ def main():
                         help="The output directory where the model predictions and checkpoints will be written.")
 
     ## Other parameters
+    parser.add_argument("--labels", default="", type=str,
+                        help="Path to a file containing all labels. If not specified, CoNLL-2003 labels are used.")
     parser.add_argument("--config_name", default="", type=str,
                         help="Pretrained config name or path if not the same as model_name")
     parser.add_argument("--tokenizer_name", default="", type=str,
@@ -406,8 +406,8 @@ def main():
     set_seed(args)
 
     # Prepare CONLL-2003 task
-    label_list = get_labels()
-    num_labels = len(label_list)
+    labels = get_labels(args.labels)
+    num_labels = len(labels)
     # Use cross entropy ignore index as padding label id so that only real label ids contribute to the loss later
     pad_token_label_id = CrossEntropyLoss().ignore_index
 
@@ -433,8 +433,8 @@ def main():
 
     # Training
     if args.do_train:
-        train_dataset = load_and_cache_examples(args, tokenizer, pad_token_label_id, evaluate=False)
-        global_step, tr_loss = train(args, train_dataset, model, tokenizer, pad_token_label_id)
+        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluate=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
     # Saving best-practices: if you use defaults names for the model, you can reload it using from_pretrained()
@@ -466,7 +466,7 @@ def main():
             global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
-            result = evaluate(args, model, tokenizer, pad_token_label_id, prefix=global_step)
+            result = evaluate(args, model, tokenizer, labels, pad_token_label_id, prefix=global_step)
             if global_step:
                 result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
             results.update(result)
diff --git a/examples/utils_ner.py b/examples/utils_ner.py
index 39f6d08149..27f76d5a59 100644
--- a/examples/utils_ner.py
+++ b/examples/utils_ner.py
@@ -202,5 +202,12 @@ def convert_examples_to_features(examples,
     return features
 
 
-def get_labels():
-    return ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]
+def get_labels(path):
+    if path:
+        with open(path, "r") as f:
+            labels = f.read().splitlines()
+        if "O" not in labels:
+            labels = ["O"] + labels
+        return labels
+    else:
+        return ["O", "B-MISC", "I-MISC",  "B-PER", "I-PER", "B-ORG", "I-ORG", "B-LOC", "I-LOC"]

From 5ff9cd158a08f6bcfa5c635c0a2eb6d79e4ef9c2 Mon Sep 17 00:00:00 2001
From: Marianne Stecklina <marianne@omnius.com>
Date: Mon, 23 Sep 2019 10:51:54 +0200
Subject: [PATCH 053/144] Add option to predict on test set

---
 examples/run_ner.py   | 46 ++++++++++++++++++++++++++++++++++---------
 examples/utils_ner.py | 19 +++++++++---------
 2 files changed, 46 insertions(+), 19 deletions(-)

diff --git a/examples/run_ner.py b/examples/run_ner.py
index f51f5ae2a1..6c6b0f8336 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -148,7 +148,7 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
                 if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                     # Log metrics
                     if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
-                        results = evaluate(args, model, tokenizer, labels, pad_token_label_id)
+                        results, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id)
                         for key, value in results.items():
                             tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                     tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
@@ -178,8 +178,8 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
     return global_step, tr_loss / global_step
 
 
-def evaluate(args, model, tokenizer, labels, pad_token_label_id, prefix=""):
-    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluate=True)
+def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""):
+    eval_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode=mode)
 
     args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
     # Note that DistributedSampler samples randomly
@@ -241,15 +241,15 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, prefix=""):
     for key in sorted(results.keys()):
         logger.info("  %s = %s", key, str(results[key]))
 
-    return results
+    return results, preds_list
 
 
-def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluate=False):
+def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
     if args.local_rank not in [-1, 0] and not evaluate:
         torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
 
     # Load data features from cache or dataset file
-    cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format("dev" if evaluate else "train",
+    cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format(mode,
         list(filter(None, args.model_name_or_path.split("/"))).pop(),
         str(args.max_seq_length)))
     if os.path.exists(cached_features_file):
@@ -257,7 +257,7 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluat
         features = torch.load(cached_features_file)
     else:
         logger.info("Creating features from dataset file at %s", args.data_dir)
-        examples = read_examples_from_file(args.data_dir, evaluate=evaluate)
+        examples = read_examples_from_file(args.data_dir, mode)
         features = convert_examples_to_features(examples, labels, args.max_seq_length, tokenizer,
                                                 cls_token_at_end=bool(args.model_type in ["xlnet"]),
                                                 # xlnet has a cls token at the end
@@ -318,6 +318,8 @@ def main():
                         help="Whether to run training.")
     parser.add_argument("--do_eval", action="store_true",
                         help="Whether to run eval on the dev set.")
+    parser.add_argument("--do_predict", action="store_true",
+                        help="Whether to run predictions on the test set.")
     parser.add_argument("--evaluate_during_training", action="store_true",
                         help="Whether to run evaluation during training at each logging step.")
     parser.add_argument("--do_lower_case", action="store_true",
@@ -433,7 +435,7 @@ def main():
 
     # Training
     if args.do_train:
-        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, evaluate=False)
+        train_dataset = load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode="train")
         global_step, tr_loss = train(args, train_dataset, model, tokenizer, labels, pad_token_label_id)
         logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
@@ -466,7 +468,7 @@ def main():
             global_step = checkpoint.split("-")[-1] if len(checkpoints) > 1 else ""
             model = model_class.from_pretrained(checkpoint)
             model.to(args.device)
-            result = evaluate(args, model, tokenizer, labels, pad_token_label_id, prefix=global_step)
+            result, _ = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="dev", prefix=global_step)
             if global_step:
                 result = {"{}_{}".format(global_step, k): v for k, v in result.items()}
             results.update(result)
@@ -475,6 +477,32 @@ def main():
             for key in sorted(results.keys()):
                 writer.write("{} = {}\n".format(key, str(results[key])))
 
+    if args.do_predict and args.local_rank in [-1, 0]:
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        model = model_class.from_pretrained(args.output_dir)
+        model.to(args.device)
+        result, predictions = evaluate(args, model, tokenizer, labels, pad_token_label_id, mode="test")
+        # Save results
+        output_test_results_file = os.path.join(args.output_dir, "test_results.txt")
+        with open(output_test_results_file, "w") as writer:
+            for key in sorted(result.keys()):
+                writer.write("{} = {}\n".format(key, str(result[key])))
+        # Save predictions
+        output_test_predictions_file = os.path.join(args.output_dir, "test_predictions.txt")
+        with open(output_test_predictions_file, "w") as writer:
+            with open(os.path.join(args.data_dir, "test.txt"), "r") as f:
+                example_id = 0
+                for line in f:
+                    if line.startswith("-DOCSTART-") or line == "" or line == "\n":
+                        writer.write(line)
+                        if not predictions[example_id]:
+                            example_id += 1
+                    elif predictions[example_id]:
+                        output_line = line.split()[0] + " " + predictions[example_id].pop(0) + "\n"
+                        writer.write(output_line)
+                    else:
+                        logger.warning("Maximum sequence length exceeded: No prediction for '%s'.", line.split()[0])
+
     return results
 
 
diff --git a/examples/utils_ner.py b/examples/utils_ner.py
index 27f76d5a59..c20d7b0d1f 100644
--- a/examples/utils_ner.py
+++ b/examples/utils_ner.py
@@ -51,13 +51,8 @@ class InputFeatures(object):
         self.label_ids = label_ids
 
 
-def read_examples_from_file(data_dir, evaluate=False):
-    if evaluate:
-        file_path = os.path.join(data_dir, "dev.txt")
-        guid_prefix = "dev"
-    else:
-        file_path = os.path.join(data_dir, "train.txt")
-        guid_prefix = "train"
+def read_examples_from_file(data_dir, mode):
+    file_path = os.path.join(data_dir, "{}.txt".format(mode))
     guid_index = 1
     examples = []
     with open(file_path, encoding="utf-8") as f:
@@ -66,7 +61,7 @@ def read_examples_from_file(data_dir, evaluate=False):
         for line in f:
             if line.startswith("-DOCSTART-") or line == "" or line == "\n":
                 if words:
-                    examples.append(InputExample(guid="{}-{}".format(guid_prefix, guid_index),
+                    examples.append(InputExample(guid="{}-{}".format(mode, guid_index),
                                                  words=words,
                                                  labels=labels))
                     guid_index += 1
@@ -75,9 +70,13 @@ def read_examples_from_file(data_dir, evaluate=False):
             else:
                 splits = line.split(" ")
                 words.append(splits[0])
-                labels.append(splits[-1].replace("\n", ""))
+                if len(splits) > 1:
+                    labels.append(splits[-1].replace("\n", ""))
+                else:
+                    # Examples could have no label for mode = "test"
+                    labels.append("O")
         if words:
-            examples.append(InputExample(guid="%s-%d".format(guid_prefix, guid_index),
+            examples.append(InputExample(guid="%s-%d".format(mode, guid_index),
                                          words=words,
                                          labels=labels))
     return examples

From 66adb71734d27575678e3a67cf1b70d871d0aac1 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 3 Oct 2019 16:54:40 -0400
Subject: [PATCH 054/144] update to transformers

---
 examples/run_ner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/run_ner.py b/examples/run_ner.py
index 6c6b0f8336..0e40ad02a6 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -33,8 +33,8 @@ from torch.utils.data.distributed import DistributedSampler
 from tqdm import tqdm, trange
 from utils_ner import convert_examples_to_features, get_labels, read_examples_from_file
 
-from pytorch_transformers import AdamW, WarmupLinearSchedule
-from pytorch_transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
+from transformers import AdamW, WarmupLinearSchedule
+from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
 
 logger = logging.getLogger(__name__)
 

From 0f9ebb0b43e825afd4d2dea2484b75704c3b6794 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 3 Oct 2019 16:54:52 -0400
Subject: [PATCH 055/144] add seqeval as requirement for examples

---
 examples/requirements.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/requirements.txt b/examples/requirements.txt
index 42abe8933c..b44e86176e 100644
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -1,2 +1,3 @@
 tensorboardX
-scikit-learn
\ No newline at end of file
+scikit-learn
+seqeval
\ No newline at end of file

From 788e632622b27f6665e8e85ae23f3f93552a1dd7 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Fri, 11 Oct 2019 18:04:29 -0400
Subject: [PATCH 056/144] [ner] Honor args.overwrite_cache

---
 examples/run_ner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_ner.py b/examples/run_ner.py
index 0e40ad02a6..fdf2f1924a 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -252,7 +252,7 @@ def load_and_cache_examples(args, tokenizer, labels, pad_token_label_id, mode):
     cached_features_file = os.path.join(args.data_dir, "cached_{}_{}_{}".format(mode,
         list(filter(None, args.model_name_or_path.split("/"))).pop(),
         str(args.max_seq_length)))
-    if os.path.exists(cached_features_file):
+    if os.path.exists(cached_features_file) and not args.overwrite_cache:
         logger.info("Loading features from cached file %s", cached_features_file)
         features = torch.load(cached_features_file)
     else:

From c55badcee0c702f184aee2c85a0146c8804cc141 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 15 Oct 2019 09:33:52 +0200
Subject: [PATCH 057/144] Add NER finetuning details by @stefan-it in example
 readme

---
 examples/README.md | 103 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 102 insertions(+), 1 deletion(-)

diff --git a/examples/README.md b/examples/README.md
index 382d794fcb..806601f9f3 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -8,8 +8,9 @@ similar API between the different models.
 | [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
 | [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.                                         |
 | [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision.                              |
-| [SQuAD](#squad) | Using BERT for question answering, examples with distributed training.                                                                                  |
+| [SQuAD](#squad) | Using BERT/XLM/XLNet/RoBERTa for question answering, examples with distributed training.                                                                                  |
 | [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
+| [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training.                                                                                  |
 
 ## Language model fine-tuning
 
@@ -390,3 +391,103 @@ exact_match = 86.91
 This fine-tuneds model is available as a checkpoint under the reference
 `bert-large-uncased-whole-word-masking-finetuned-squad`.
 
+## Named Entity Recognition
+
+Based on the script [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py).
+This example fine-tunes Bert Multilingual on GermEval 2014 (German NER).
+Details and results for the fine-tuning provided by @stefan-it.
+
+### Data (Download and pre-processing steps)
+
+Data can be obtained from the [GermEval 2014](https://sites.google.com/site/germeval2014ner/data) shared task page.
+
+Here are the commands for downloading and pre-processing train, dev and test datasets. The original data format has four (tab-separated) columns; in a pre-processing step, only the two relevant columns (token and outer span NER annotation) are extracted:
+
+```bash
+curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-train.tsv?attredirects=0&d=1' \
+| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > train.txt.tmp
+curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-dev.tsv?attredirects=0&d=1' \
+| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > dev.txt.tmp
+curl -L 'https://sites.google.com/site/germeval2014ner/data/NER-de-test.tsv?attredirects=0&d=1' \
+| grep -v "^#" | cut -f 2,3 | tr '\t' ' ' > test.txt.tmp
+```
+
+The GermEval 2014 dataset contains some strange "control character" tokens like `'\x96', '\u200e', '\x95', '\xad' or '\x80'`. One problem with these tokens is that `BertTokenizer` returns an empty token for them, resulting in misaligned `InputExample`s. I wrote a script that a) filters these tokens and b) splits longer sentences into smaller ones (once the max. subtoken length is reached).
+
+```bash
+wget "https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py"
+```
+Let's define some variables that we need for further pre-processing steps and training the model:
+
+```bash
+export MAX_LENGTH=128
+export BERT_MODEL=bert-base-multilingual-cased
+```
+
+Run the pre-processing script on training, dev and test datasets:
+
+```bash
+python3 preprocess.py train.txt.tmp $BERT_MODEL $MAX_LENGTH > train.txt
+python3 preprocess.py dev.txt.tmp $BERT_MODEL $MAX_LENGTH > dev.txt
+python3 preprocess.py test.txt.tmp $BERT_MODEL $MAX_LENGTH > test.txt
+```
+
+The GermEval 2014 dataset has many more labels than the CoNLL-2002/2003 datasets, so a dedicated set of labels must be used:
+
+```bash
+cat train.txt dev.txt test.txt | cut -d " " -f 2 | grep -v "^$"| sort | uniq > labels.txt
+```
+
+### Training
+
+Additional environment variables must be set:
+
+```bash
+export OUTPUT_DIR=germeval-model
+export BATCH_SIZE=32
+export NUM_EPOCHS=3
+export SAVE_STEPS=750
+export SEED=1
+```
+
+To start training, just run:
+
+```bash
+python3 run_ner.py --data_dir ./ \
+--model_type bert \
+--labels ./labels.txt \
+--model_name_or_path $BERT_MODEL \
+--output_dir $OUTPUT_DIR \
+--max_seq_length  $MAX_LENGTH \
+--num_train_epochs $NUM_EPOCHS \
+--per_gpu_train_batch_size $BATCH_SIZE \
+--save_steps $SAVE_STEPS \
+--seed $SEED \
+--do_train \
+--do_eval \
+--do_predict
+```
+
+If your GPU supports half-precision training, just add the `--fp16` flag. After training, the model will be evaluated on both the development and test datasets.
+
+### Evaluation
+
+Evaluation on development dataset outputs the following for our example:
+
+```bash
+10/04/2019 00:42:06 - INFO - __main__ -   ***** Eval results  *****
+10/04/2019 00:42:06 - INFO - __main__ -     f1 = 0.8623348017621146
+10/04/2019 00:42:06 - INFO - __main__ -     loss = 0.07183869666975543
+10/04/2019 00:42:06 - INFO - __main__ -     precision = 0.8467916366258111
+10/04/2019 00:42:06 - INFO - __main__ -     recall = 0.8784592370979806
+```
+
+On the test dataset the following results could be achieved:
+
+```bash
+10/04/2019 00:42:42 - INFO - __main__ -   ***** Eval results  *****
+10/04/2019 00:42:42 - INFO - __main__ -     f1 = 0.8614389652384803
+10/04/2019 00:42:42 - INFO - __main__ -     loss = 0.07064602487454782
+10/04/2019 00:42:42 - INFO - __main__ -     precision = 0.8604651162790697
+10/04/2019 00:42:42 - INFO - __main__ -     recall = 0.8624150210424085
+```

From 2c1d5564ad8e7d937bccf500a12e95423f4b6545 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 15 Oct 2019 09:56:52 +0200
Subject: [PATCH 058/144] add readme information

---
 examples/README.md | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

diff --git a/examples/README.md b/examples/README.md
index 382d794fcb..9465b9ad82 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -5,12 +5,35 @@ similar API between the different models.
 
 | Section                    | Description                                                                                                                                                |
 |----------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| [TensorFlow 2.0 models on GLUE](#TensorFlow-2.0-Bert-models-on-GLUE) | Examples running BERT TensorFlow 2.0 model on the GLUE tasks. 
 | [Language Model fine-tuning](#language-model-fine-tuning) | Fine-tuning the library models for language modeling on a text dataset. Causal language modeling for GPT/GPT-2, masked language modeling for BERT/RoBERTa. |
 | [Language Generation](#language-generation) | Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.                                         |
 | [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision.                              |
 | [SQuAD](#squad) | Using BERT for question answering, examples with distributed training.                                                                                  |
 | [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
 
+## TensorFlow 2.0 Bert models on GLUE
+
+Based on the script [`run_tf_glue.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_glue.py).
+
+Fine-tuning the library TensorFlow 2.0 Bert model for sequence classification on the MRPC task of the GLUE benchmark: [General Language Understanding Evaluation](https://gluebenchmark.com/).
+
+This script has an option for mixed precision (Automatic Mixed Precision / AMP) to run models on Tensor Cores (NVIDIA Volta/Turing GPUs) and future hardware and an option for XLA, which uses the XLA compiler to reduce model runtime.
+Options are toggled using `USE_XLA` or `USE_AMP` variables in the script.
+These options and the below benchmark are provided by @tlkh.
+
+Quick benchmarks from the script (no other modifications):
+
+| GPU    | Mode | Time (2nd epoch) | Val Acc (3 runs) |
+| --------- | -------- | ----------------------- | ----------------------|
+| Titan V | FP32 | 41s | 0.8438/0.8281/0.8333 |
+| Titan V | AMP | 26s | 0.8281/0.8568/0.8411 |
+| V100    | FP32 | 35s | 0.8646/0.8359/0.8464 |
+| V100    | AMP | 22s | 0.8646/0.8385/0.8411 |
+| 1080 Ti | FP32 | 55s | - | 
+
+Mixed precision (AMP) reduces the training time considerably for the same hardware and hyper-parameters (same batch size was used).
+
 ## Language model fine-tuning
 
 Based on the script [`run_lm_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_lm_finetuning.py).

From 898ce064f8c53b8744c51358d49eff51af0a8713 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 15 Oct 2019 10:04:19 +0200
Subject: [PATCH 059/144] add tests on TF2.0 & PT checkpoint => model
 convertion functions

---
 transformers/tests/modeling_tf_common_test.py | 23 ++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py
index 360f86ea69..f636c42889 100644
--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 from __future__ import absolute_import, division, print_function
 
+import os
 import copy
 import json
 import logging
@@ -118,7 +119,7 @@ class TFCommonTestCases:
                 tf_model = model_class(config)
                 pt_model = pt_model_class(config)
 
-                # Check we can load pt model in tf and vice-versa (architecture similar)
+                # Check we can load pt model in tf and vice-versa with model => model functions
                 tf_model = transformers.load_pytorch_model_in_tf2_model(tf_model, pt_model, tf_inputs=inputs_dict)
                 pt_model = transformers.load_tf2_model_in_pytorch_model(pt_model, tf_model)
 
@@ -132,6 +133,26 @@ class TFCommonTestCases:
                 max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
                 self.assertLessEqual(max_diff, 2e-2)
 
+                # Check we can load pt model in tf and vice-versa with checkpoint => model functions
+                with TemporaryDirectory() as tmpdirname:
+                    pt_checkpoint_path = os.path.join(tmpdirname, 'pt_model.bin')
+                    torch.save(pt_model.state_dict(), pt_checkpoint_path)
+                    tf_model = transformers.load_pytorch_checkpoint_in_tf2_model(tf_model, pt_checkpoint_path)
+
+                    tf_checkpoint_path = os.path.join(tmpdirname, 'tf_model.h5')
+                    tf_model.save_weights(tf_checkpoint_path)
+                    pt_model = transformers.load_tf2_checkpoint_in_pytorch_model(pt_model, tf_checkpoint_path)
+
+                # Check predictions on first output (logits/hidden-states) are close enough given low-level computational differences
+                pt_model.eval()
+                pt_inputs_dict = dict((name, torch.from_numpy(key.numpy()).to(torch.long))
+                                      for name, key in inputs_dict.items())
+                with torch.no_grad():
+                    pto = pt_model(**pt_inputs_dict)
+                tfo = tf_model(inputs_dict)
+                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                self.assertLessEqual(max_diff, 2e-2)
+
         def test_compile_tf_model(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 

From 5875aaf7625e3fcd6c42313fb401f9b959b97b10 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 15 Oct 2019 10:36:46 +0200
Subject: [PATCH 060/144] install tensorboard

---
 examples/requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/examples/requirements.txt b/examples/requirements.txt
index 42abe8933c..cca63d7627 100644
--- a/examples/requirements.txt
+++ b/examples/requirements.txt
@@ -1,2 +1,3 @@
 tensorboardX
+tensorboard
 scikit-learn
\ No newline at end of file

From 260ac7d9a8501f6c631adc355e269e7f3f6274f6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Tue, 15 Oct 2019 12:24:35 +0200
Subject: [PATCH 061/144] wip commit, switching computers

---
 examples/run_seq2seq_finetuning.py      | 42 ++++++++--------
 examples/run_seq2seq_finetuning_test.py | 64 +++++++++++++++++++++++++
 2 files changed, 85 insertions(+), 21 deletions(-)
 create mode 100644 examples/run_seq2seq_finetuning_test.py

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index 5d7da58a23..1f247ab25b 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -31,7 +31,7 @@ Natural Language Understanding and Generation.” (May 2019) ArXiv:1905.03197
 """
 
 import argparse
-import dequeue
+from collections import deque
 import logging
 import pickle
 import random
@@ -57,9 +57,9 @@ class TextDataset(Dataset):
 
     CNN/Daily News:
 
-    The CNN/Daily News raw datasets are downloaded from [1]. They consist in stories stored
-    in different files where the summary sentences are indicated by the special `@highlight` token.
-    To process the data, untar both datasets in the same folder, and pass the path to this
+    The CNN/Daily News raw datasets are downloaded from [1]. The stories are stored in different files; the summary appears at the end of the story as
+    sentences that are prefixed by the special `@highlight` line. To process the
+    data, untar both datasets in the same folder, and pass the path to this
     folder as the "data_dir argument. The formatting code was inspired by [2].
 
     [1] https://cs.nyu.edu/~kcho/
@@ -69,7 +69,7 @@ class TextDataset(Dataset):
         assert os.path.isdir(data_dir)
 
         # Load features that have already been computed if present
-        cached_features_file = os.path.join(directory, "cached_lm_{}_{}".format(block_size, data_dir)
+        cached_features_file = os.path.join(directory, "cached_lm_{}_{}".format(block_size, data_dir))
         if os.path.exists(cached_features_file):
             logger.info("Loading features from cached file %s", cached_features_file)
             with open(cached_features_file, "rb") as source:
@@ -86,18 +86,19 @@ class TextDataset(Dataset):
             stories_files = os.listdir(path_to_stories)
             for story_file in stories_files:
                 path_to_story = os.path.join(path_to_stories, "story_file")
-                if !os.path.isfile(path_to_story):
+                if not os.path.isfile(path_to_story):
                     continue
 
                 with open(path_to_story, encoding="utf-8") as source:
                     try:
-                        story, summary = process_story(source)
+                        raw_story = source.read()
+                        story, summary = process_story(raw_story)
                     except IndexError:
                         continue
 
                 story = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(story))
                 summary = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(summary))
-                story_seq, summary_seq = _fit_to_block_size(story, summary, blocksize)
+                story_seq, summary_seq = _fit_to_block_size(story, summary, block_size)
                 example = tokenizer.add_special_token_sequence_pair(story_seq, summary_seq)
                 self.examples.append(example)
 
@@ -108,22 +109,22 @@ class TextDataset(Dataset):
     def __len__(self):
         return len(self.examples)
 
-    def __getitem__(self):
+    def __getitem__(self, items):
         return torch.tensor(self.examples[items])
 
 
-def process_story(story_file):
+def process_story(raw_story):
     """ Process the text contained in a story file.
     Returns the story and the summary
     """
-    file_lines = list(filter(lambda x: len(x)!=0, [line.strip() for lines in story_file]))
+    file_lines = list(filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")]))
 
     # for some unknown reason some lines miss a period, add it
     file_lines = [_add_missing_period(line) for line in file_lines]
 
     # gather article lines
     story_lines = []
-    lines = dequeue(file_lines)
+    lines = deque(file_lines)
     while True:
         try:
             element = lines.popleft()
@@ -134,7 +135,7 @@ def process_story(story_file):
             raise ie
 
     # gather summary lines
-    highlights_lines = list(filter(lambda t: !t.startswith("@highlight"), lines))
+    highlights_lines = list(filter(lambda t: not t.startswith("@highlight"), lines))
 
     # join the lines
     story = " ".join(story_lines)
@@ -145,7 +146,7 @@ def process_story(story_file):
 
 def _add_missing_period(line):
     END_TOKENS = ['.', '!', '?', '...', "'", "`", '"', u'\u2019', u'\u2019', ")"]
-    if line == "@highlight":
+    if line.startswith("@highlight"):
         return line
     if line[-1] in END_TOKENS:
         return line
@@ -163,8 +164,8 @@ def _fit_to_block_size(src_sequence, tgt_sequence, block_size):
     [1] Dong, Li, et al. "Unified Language Model Pre-training for Natural
     Language Understanding and Generation." arXiv preprint arXiv:1905.03197 (2019).
     """
-    SRC_MAX_LENGTH = int(0.75 * block_size) - 2 # CLS and EOS token
-    TGT_MAX_LENGTH = block_size - SRC_MAX_LENGTH - 1 # EOS token
+    SRC_MAX_LENGTH = int(0.75 * block_size) - 2  # CLS and EOS token
+    TGT_MAX_LENGTH = block_size - SRC_MAX_LENGTH - 1  # EOS token
 
     # we dump the examples that are too small to fit in the block size for the
     # sake of simplicity. You can modify this by adding model-specific padding.
@@ -172,22 +173,21 @@ def _fit_to_block_size(src_sequence, tgt_sequence, block_size):
         return None
 
     # the source sequence has `[SEP_i]` special tokens with i \in [0,9]. We keep them for now.
-    if len(src_sequence) > SRC_MAX_LENGTH
+    if len(src_sequence) > SRC_MAX_LENGTH:
         if len(tgt_sequence) > TGT_MAX_LENGTH:
             src_sequence = src_sequence[:SRC_MAX_LENGTH]
             tgt_sequence = tgt_sequence[:TGT_MAX_LENGTH]
         else:
             src_sequence = src_sequence[block_size - len(tgt_sequence) - 3]
     else:
-        if len(tgt_tokens) > TGT_MAX_LENGTH:
+        if len(tgt_sequence) > TGT_MAX_LENGTH:
             tgt_sequence = tgt_sequence[block_size - len(src_sequence) - 3]
 
     return src_sequence, tgt_sequence
 
 
-
 def load_and_cache_examples(args, tokenizer):
-    dataset = TextDataset(tokenizer, file_path=args.train_data_file)
+    dataset = TextDataset(tokenizer, file_path=args.data_dir)
     return dataset
 
 
@@ -200,7 +200,7 @@ def main():
     parser = argparse.ArgumentParser()
 
     # Required parameters
-    parser.add_argument("--train_data_file",
+    parser.add_argument("--data_dir",
                         default=None,
                         type=str,
                         required=True,
diff --git a/examples/run_seq2seq_finetuning_test.py b/examples/run_seq2seq_finetuning_test.py
new file mode 100644
index 0000000000..34d9add10d
--- /dev/null
+++ b/examples/run_seq2seq_finetuning_test.py
@@ -0,0 +1,64 @@
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+from .run_seq2seq_finetuning import process_story, _fit_to_block_size
+
+
+class DataLoaderTest(unittest.TestCase):
+    def __init__(self, block_size=10):
+        self.block_size = block_size
+
+    def source_and_target_too_small(self):
+        """ When the sum of the lengths of the source and target sequences is
+        smaller than the block size (minus the number of special tokens), skip the example. """
+        src_seq = [1, 2, 3, 4]
+        tgt_seq = [5, 6]
+        self.assertEqual(_fit_to_block_size(src_seq, tgt_seq, self.block_size), None)
+
+    def source_and_target_fit_exactly(self):
+        """ When the sum of the lengths of the source and target sequences is
+        equal to the block size (minus the number of special tokens), return the
+        sequences unchanged. """
+        src_seq = [1, 2, 3, 4]
+        tgt_seq = [5, 6, 7]
+        fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
+        self.assertListEqual(src_seq == fitted_src)
+        self.assertListEqual(tgt_seq == fitted_tgt)
+
+    def source_too_big_target_ok(self):
+        src_seq = [1, 2, 3, 4, 5, 6]
+        tgt_seq = [1, 2]
+        fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
+        self.assertListEqual(src_seq == [1, 2, 3, 4, 5])
+        self.assertListEqual(tgt_seq == fitted_tgt)
+
+    def target_too_big_source_ok(self):
+        src_seq = [1, 2, 3, 4]
+        tgt_seq = [1, 2, 3, 4]
+        fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
+        self.assertListEqual(src_seq == src_seq)
+        self.assertListEqual(tgt_seq == [1, 2, 3])
+
+    def source_and_target_too_big(self):
+        src_seq = [1, 2, 3, 4, 5, 6, 7]
+        tgt_seq = [1, 2, 3, 4, 5, 6, 7]
+        fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
+        self.assertListEqual(src_seq == [1, 2, 3, 4, 5])
+        self.assertListEqual(tgt_seq == [1, 2])
+
+
+if __name__ == "__main__":
+    unittest.main()

From 22e1af68596690558cd8df45b6bc75e665cc1c1b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi.louf@kering.com>
Date: Tue, 15 Oct 2019 14:39:56 +0200
Subject: [PATCH 062/144] truncation function is fully tested

---
 examples/run_seq2seq_finetuning.py      | 101 ++++++++++++++----------
 examples/run_seq2seq_finetuning_test.py |  32 ++++----
 2 files changed, 74 insertions(+), 59 deletions(-)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index 1f247ab25b..e926523a17 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -41,7 +41,7 @@ import numpy as np
 import torch
 from torch.utils.data import Dataset
 
-from transformers import BertConfig, Bert2Rnd, BertTokenizer
+from transformers import BertTokenizer
 
 logger = logging.getLogger(__name__)
 
@@ -57,19 +57,23 @@ class TextDataset(Dataset):
 
     CNN/Daily News:
 
-    The CNN/Daily News raw datasets are downloaded from [1]. The stories are stored in different files; the summary appears at the end of the story as
-    sentences that are prefixed by the special `@highlight` line. To process the
-    data, untar both datasets in the same folder, and pass the path to this
+    The CNN/Daily News raw datasets are downloaded from [1]. The stories are
+    stored in different files; the summary appears at the end of the story as
+    sentences that are prefixed by the special `@highlight` line. To process
+    the data, untar both datasets in the same folder, and pass the path to this
     folder as the "data_dir argument. The formatting code was inspired by [2].
 
     [1] https://cs.nyu.edu/~kcho/
     [2] https://github.com/abisee/cnn-dailymail/
     """
-    def __init_(self, tokenizer, data_dir='', block_size=512):
+
+    def __init_(self, tokenizer, data_dir="", block_size=512):
         assert os.path.isdir(data_dir)
 
         # Load features that have already been computed if present
-        cached_features_file = os.path.join(directory, "cached_lm_{}_{}".format(block_size, data_dir))
+        cached_features_file = os.path.join(
+            data_dir, "cached_lm_{}_{}".format(block_size, data_dir)
+        )
         if os.path.exists(cached_features_file):
             logger.info("Loading features from cached file %s", cached_features_file)
             with open(cached_features_file, "rb") as source:
@@ -78,7 +82,7 @@ class TextDataset(Dataset):
 
         logger.info("Creating features from dataset at %s", data_dir)
 
-        datasets = ['cnn', 'dailymail']
+        datasets = ["cnn", "dailymail"]
         for dataset in datasets:
             path_to_stories = os.path.join(data_dir, dataset, "stories")
             assert os.path.isdir(path_to_stories)
@@ -99,7 +103,9 @@ class TextDataset(Dataset):
                 story = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(story))
                 summary = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(summary))
                 story_seq, summary_seq = _fit_to_block_size(story, summary, block_size)
-                example = tokenizer.add_special_token_sequence_pair(story_seq, summary_seq)
+                example = tokenizer.add_special_token_sequence_pair(
+                    story_seq, summary_seq
+                )
                 self.examples.append(example)
 
         logger.info("Saving features into cache file %s", cached_features_file)
@@ -117,7 +123,9 @@ def process_story(raw_story):
     """ Process the text contained in a story file.
     Returns the story and the summary
     """
-    file_lines = list(filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")]))
+    file_lines = list(
+        filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
+    )
 
     # for some unknown reason some lines miss a period, add it
     file_lines = [_add_missing_period(line) for line in file_lines]
@@ -145,7 +153,7 @@ def process_story(raw_story):
 
 
 def _add_missing_period(line):
-    END_TOKENS = ['.', '!', '?', '...', "'", "`", '"', u'\u2019', u'\u2019', ")"]
+    END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"]
     if line.startswith("@highlight"):
         return line
     if line[-1] in END_TOKENS:
@@ -154,34 +162,35 @@ def _add_missing_period(line):
 
 
 def _fit_to_block_size(src_sequence, tgt_sequence, block_size):
-    """ Concatenate the sequences and adapt their lengths to the block size.
+    """ Adapt the source and target sequences' lengths to the block size.
 
-    Following [1] we truncate the source and target + tokens sequences so they fit
-    in the block size. If the concatenated sequence is longer than 512 we follow
-    the 75%/25% rule in [1]: limit the source sequence's length to 384 and the
-    target sequence's length to 128.
+    If the concatenated sequence (source + target + 3 special tokens) would be
+    longer than the block size we use the 75% / 25% rule followed in [1]. For a
+    block size of 512 this means limiting the source sequence's length to 384
+    and the target sequence's length to 128.
 
     [1] Dong, Li, et al. "Unified Language Model Pre-training for Natural
     Language Understanding and Generation." arXiv preprint arXiv:1905.03197 (2019).
     """
     SRC_MAX_LENGTH = int(0.75 * block_size) - 2  # CLS and EOS token
-    TGT_MAX_LENGTH = block_size - SRC_MAX_LENGTH - 1  # EOS token
+    TGT_MAX_LENGTH = block_size - (SRC_MAX_LENGTH + 2) - 1  # EOS token
 
-    # we dump the examples that are too small to fit in the block size for the
+    # We dump the examples that are too small to fit in the block size for the
     # sake of simplicity. You can modify this by adding model-specific padding.
-    if len(src_sequence) + len(src_sequence) + 3 < block_size:
+    if len(src_sequence) + len(tgt_sequence) + 3 < block_size:
         return None
 
-    # the source sequence has `[SEP_i]` special tokens with i \in [0,9]. We keep them for now.
     if len(src_sequence) > SRC_MAX_LENGTH:
         if len(tgt_sequence) > TGT_MAX_LENGTH:
             src_sequence = src_sequence[:SRC_MAX_LENGTH]
             tgt_sequence = tgt_sequence[:TGT_MAX_LENGTH]
         else:
-            src_sequence = src_sequence[block_size - len(tgt_sequence) - 3]
+            remain_size = block_size - len(tgt_sequence) - 3
+            src_sequence = src_sequence[:remain_size]
     else:
         if len(tgt_sequence) > TGT_MAX_LENGTH:
-            tgt_sequence = tgt_sequence[block_size - len(src_sequence) - 3]
+            remain_size = block_size - len(src_sequence) - 3
+            tgt_sequence = tgt_sequence[:remain_size]
 
     return src_sequence, tgt_sequence
 
@@ -200,44 +209,50 @@ def main():
     parser = argparse.ArgumentParser()
 
     # Required parameters
-    parser.add_argument("--data_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The input training data file (a text file).")
-    parser.add_argument("--output_dir",
-                        default=None,
-                        type=str,
-                        required=True,
-                        help="The output directory where the model predictions and checkpoints will be written.")
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input training data file (a text file).",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
 
     # Optional parameters
-    parser.add_argument("--model_name_or_path",
-                        default="bert-base-cased",
-                        type=str,
-                        help="The model checkpoint for weights initialization.")
+    parser.add_argument(
+        "--model_name_or_path",
+        default="bert-base-cased",
+        type=str,
+        help="The model checkpoint for weights initialization.",
+    )
     parser.add_argument("--seed", default=42, type=int)
     args = parser.parse_args()
 
     # Set up training device
-    device = torch.device("cpu")
+    # device = torch.device("cpu")
 
     # Set seed
     set_seed(args)
 
     # Load pretrained model and tokenizer
-    config_class, model_class, tokenizer_class = BertConfig, Bert2Rnd, BertTokenizer
-    config = config_class.from_pretrained(args.model_name_or_path)
+    tokenizer_class = BertTokenizer
+    # config = config_class.from_pretrained(args.model_name_or_path)
     tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
-    model = model_class.from_pretrained(args.model_name_or_path, config=config)
-    model.to(device)
+    # model = model_class.from_pretrained(args.model_name_or_path, config=config)
+    # model.to(device)
 
     logger.info("Training/evaluation parameters %s", args)
 
     # Training
-    train_dataset = load_and_cache_examples(args, tokenizer)
-    global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-    logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+    _ = load_and_cache_examples(args, tokenizer)
+    # global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+    # logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
 
 if __name__ == "__main__":
diff --git a/examples/run_seq2seq_finetuning_test.py b/examples/run_seq2seq_finetuning_test.py
index 34d9add10d..aff39f25b8 100644
--- a/examples/run_seq2seq_finetuning_test.py
+++ b/examples/run_seq2seq_finetuning_test.py
@@ -14,50 +14,50 @@
 # limitations under the License.
 import unittest
 
-from .run_seq2seq_finetuning import process_story, _fit_to_block_size
+from run_seq2seq_finetuning import _fit_to_block_size
 
 
 class DataLoaderTest(unittest.TestCase):
-    def __init__(self, block_size=10):
-        self.block_size = block_size
+    def setUp(self):
+        self.block_size = 10
 
-    def source_and_target_too_small(self):
+    def test_source_and_target_too_small(self):
         """ When the sum of the lengths of the source and target sequences is
         smaller than the block size (minus the number of special tokens), skip the example. """
         src_seq = [1, 2, 3, 4]
         tgt_seq = [5, 6]
         self.assertEqual(_fit_to_block_size(src_seq, tgt_seq, self.block_size), None)
 
-    def source_and_target_fit_exactly(self):
+    def test_source_and_target_fit_exactly(self):
         """ When the sum of the lengths of the source and target sequences is
         equal to the block size (minus the number of special tokens), return the
         sequences unchanged. """
         src_seq = [1, 2, 3, 4]
         tgt_seq = [5, 6, 7]
         fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
-        self.assertListEqual(src_seq == fitted_src)
-        self.assertListEqual(tgt_seq == fitted_tgt)
+        self.assertListEqual(src_seq, fitted_src)
+        self.assertListEqual(tgt_seq, fitted_tgt)
 
-    def source_too_big_target_ok(self):
+    def test_source_too_big_target_ok(self):
         src_seq = [1, 2, 3, 4, 5, 6]
         tgt_seq = [1, 2]
         fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
-        self.assertListEqual(src_seq == [1, 2, 3, 4, 5])
-        self.assertListEqual(tgt_seq == fitted_tgt)
+        self.assertListEqual(fitted_src, [1, 2, 3, 4, 5])
+        self.assertListEqual(fitted_tgt, fitted_tgt)
 
-    def target_too_big_source_ok(self):
+    def test_target_too_big_source_ok(self):
         src_seq = [1, 2, 3, 4]
         tgt_seq = [1, 2, 3, 4]
         fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
-        self.assertListEqual(src_seq == src_seq)
-        self.assertListEqual(tgt_seq == [1, 2, 3])
+        self.assertListEqual(fitted_src, src_seq)
+        self.assertListEqual(fitted_tgt, [1, 2, 3])
 
-    def source_and_target_too_big(self):
+    def test_source_and_target_too_big(self):
         src_seq = [1, 2, 3, 4, 5, 6, 7]
         tgt_seq = [1, 2, 3, 4, 5, 6, 7]
         fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
-        self.assertListEqual(src_seq == [1, 2, 3, 4, 5])
-        self.assertListEqual(tgt_seq == [1, 2])
+        self.assertListEqual(fitted_src, [1, 2, 3, 4, 5])
+        self.assertListEqual(fitted_tgt, [1, 2])
 
 
 if __name__ == "__main__":

From 1aec940587255083b2451fc18aa604de29c1188c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi.louf@kering.com>
Date: Tue, 15 Oct 2019 15:18:07 +0200
Subject: [PATCH 063/144] test the full story processing

---
 examples/run_seq2seq_finetuning.py      | 32 +++++++++++------
 examples/run_seq2seq_finetuning_test.py | 46 +++++++++++++++++++++----
 2 files changed, 62 insertions(+), 16 deletions(-)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index e926523a17..f05a5847ed 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -87,9 +87,9 @@ class TextDataset(Dataset):
             path_to_stories = os.path.join(data_dir, dataset, "stories")
             assert os.path.isdir(path_to_stories)
 
-            stories_files = os.listdir(path_to_stories)
-            for story_file in stories_files:
-                path_to_story = os.path.join(path_to_stories, "story_file")
+            story_filenames_list = os.listdir(path_to_stories)
+            for story_filename in story_filenames_list:
+                path_to_story = os.path.join(path_to_stories, story_filename)
                 if not os.path.isfile(path_to_story):
                     continue
 
@@ -97,16 +97,16 @@ class TextDataset(Dataset):
                     try:
                         raw_story = source.read()
                         story, summary = process_story(raw_story)
-                    except IndexError:
+                    except IndexError:  # skip ill-formed stories
                         continue
 
                 story = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(story))
                 summary = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(summary))
                 story_seq, summary_seq = _fit_to_block_size(story, summary, block_size)
-                example = tokenizer.add_special_token_sequence_pair(
-                    story_seq, summary_seq
+
+                self.examples.append(
+                    tokenizer.add_special_token_sequence_pair(story_seq, summary_seq)
                 )
-                self.examples.append(example)
 
         logger.info("Saving features into cache file %s", cached_features_file)
         with open(cached_features_file, "wb") as sink:
@@ -120,8 +120,13 @@ class TextDataset(Dataset):
 
 
 def process_story(raw_story):
-    """ Process the text contained in a story file.
-    Returns the story and the summary
+    """ Extract the story and summary from a story file.
+
+    Attributes:
+        raw_story (str): content of the story file as an utf-8 encoded string.
+
+    Raises:
+        IndexError: If the stoy is empty or contains no highlights.
     """
     file_lines = list(
         filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
@@ -158,7 +163,7 @@ def _add_missing_period(line):
         return line
     if line[-1] in END_TOKENS:
         return line
-    return line + " ."
+    return line + "."
 
 
 def _fit_to_block_size(src_sequence, tgt_sequence, block_size):
@@ -169,6 +174,13 @@ def _fit_to_block_size(src_sequence, tgt_sequence, block_size):
     block size of 512 this means limiting the source sequence's length to 384
     and the target sequence's length to 128.
 
+    Attributes:
+        src_sequence (list): a list of ids that maps to the tokens of the
+            source sequence.
+        tgt_sequence (list): a list of ids that maps to the tokens of the
+            target sequence.
+        block_size (int): the model's block size.
+
     [1] Dong, Li, et al. "Unified Language Model Pre-training for Natural
     Language Understanding and Generation." arXiv preprint arXiv:1905.03197 (2019).
     """
diff --git a/examples/run_seq2seq_finetuning_test.py b/examples/run_seq2seq_finetuning_test.py
index aff39f25b8..e59f016da4 100644
--- a/examples/run_seq2seq_finetuning_test.py
+++ b/examples/run_seq2seq_finetuning_test.py
@@ -14,21 +14,21 @@
 # limitations under the License.
 import unittest
 
-from run_seq2seq_finetuning import _fit_to_block_size
+from run_seq2seq_finetuning import _fit_to_block_size, process_story
 
 
 class DataLoaderTest(unittest.TestCase):
     def setUp(self):
         self.block_size = 10
 
-    def test_source_and_target_too_small(self):
+    def test_truncate_source_and_target_too_small(self):
         """ When the sum of the lengths of the source and target sequences is
         smaller than the block size (minus the number of special tokens), skip the example. """
         src_seq = [1, 2, 3, 4]
         tgt_seq = [5, 6]
         self.assertEqual(_fit_to_block_size(src_seq, tgt_seq, self.block_size), None)
 
-    def test_source_and_target_fit_exactly(self):
+    def test_truncate_source_and_target_fit_exactly(self):
         """ When the sum of the lengths of the source and target sequences is
         equal to the block size (minus the number of special tokens), return the
         sequences unchanged. """
@@ -38,27 +38,61 @@ class DataLoaderTest(unittest.TestCase):
         self.assertListEqual(src_seq, fitted_src)
         self.assertListEqual(tgt_seq, fitted_tgt)
 
-    def test_source_too_big_target_ok(self):
+    def test_truncate_source_too_big_target_ok(self):
         src_seq = [1, 2, 3, 4, 5, 6]
         tgt_seq = [1, 2]
         fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
         self.assertListEqual(fitted_src, [1, 2, 3, 4, 5])
         self.assertListEqual(fitted_tgt, fitted_tgt)
 
-    def test_target_too_big_source_ok(self):
+    def test_truncate_target_too_big_source_ok(self):
         src_seq = [1, 2, 3, 4]
         tgt_seq = [1, 2, 3, 4]
         fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
         self.assertListEqual(fitted_src, src_seq)
         self.assertListEqual(fitted_tgt, [1, 2, 3])
 
-    def test_source_and_target_too_big(self):
+    def test_truncate_source_and_target_too_big(self):
         src_seq = [1, 2, 3, 4, 5, 6, 7]
         tgt_seq = [1, 2, 3, 4, 5, 6, 7]
         fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
         self.assertListEqual(fitted_src, [1, 2, 3, 4, 5])
         self.assertListEqual(fitted_tgt, [1, 2])
 
+    def test_process_story_no_highlights(self):
+        """ Processing a story with no highlights should raise an exception.
+        """
+        raw_story = """It was the year of Our Lord one thousand seven hundred and
+        seventy-five.\n\nSpiritual revelations were conceded to England at that
+        favoured period, as at this."""
+        with self.assertRaises(IndexError):
+            process_story(raw_story)
+
+    def test_process_empty_story(self):
+        """ An empty story should also raise and exception.
+        """
+        raw_story = ""
+        with self.assertRaises(IndexError):
+            process_story(raw_story)
+
+    def test_story_with_missing_period(self):
+        raw_story = (
+            "It was the year of Our Lord one thousand seven hundred and "
+            "seventy-five\n\nSpiritual revelations were conceded to England "
+            "at that favoured period, as at this.\n@highlight\n\nIt was the best of times"
+        )
+        story, summary = process_story(raw_story)
+
+        expected_story = (
+            "It was the year of Our Lord one thousand seven hundred and "
+            "seventy-five. Spiritual revelations were conceded to England at that "
+            "favoured period, as at this."
+        )
+        self.assertEqual(expected_story, story)
+
+        expected_summary = "It was the best of times."
+        self.assertEqual(expected_summary, summary)
+
 
 if __name__ == "__main__":
     unittest.main()

From 19e99647806ef597e2b21fd6ec2fed6624bdb696 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi.louf@kering.com>
Date: Tue, 15 Oct 2019 15:20:28 +0200
Subject: [PATCH 064/144] remove Bert2Bert from module declaration

---
 transformers/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index 006ba9ed16..5248bc9f1b 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -64,7 +64,7 @@ if is_torch_available():
                                 BertForMaskedLM, BertForNextSentencePrediction,
                                 BertForSequenceClassification, BertForMultipleChoice,
                                 BertForTokenClassification, BertForQuestionAnswering,
-                                load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP, Bert2Rnd)
+                                load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
                                 OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
                                 load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)

From 0d81fc853edac730067c0a2b3120dcc87ca6d15e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi.louf@kering.com>
Date: Tue, 15 Oct 2019 15:26:33 +0200
Subject: [PATCH 065/144] specify in readme that both datasets are required

---
 examples/README.md | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index ba58a61012..e0fe1fc704 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -395,13 +395,17 @@ This fine-tuned model is available as a checkpoint under the reference
 
 Based on the script [`run_seq2seq_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_seq2seq_finetuning.py).
 
-Before running this script you should download **both** CNN and Daily Mail datasets (the links next to "Stories") from [Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/) in the same folder. Then uncompress the archives by running:
+Before running this script you should download **both** CNN and Daily Mail
+datasets from [Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/)  (the
+links next to "Stories") in the same folder. Then uncompress the archives by running:
 
 ```bash
 tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
 ```
 
-We will refer as `$DATA_PATH` the path to where you uncompressed both archive.
+note that the finetuning script **will not work** if you do not download both
+datasets. We will refer as `$DATA_PATH` the path to where you uncompressed both
+archive.
 
 ## Bert2Bert and abstractive summarization
 
@@ -414,4 +418,4 @@ python run_seq2seq_finetuning.py \
     --model_name_or_path=bert2bert \
     --do_train \
     --data_path=$DATA_PATH \
-```
\ No newline at end of file
+```

From 6d6c32673726896d682f71a40476576972d127b1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi.louf@kering.com>
Date: Tue, 15 Oct 2019 16:07:07 +0200
Subject: [PATCH 066/144] take path to pretrained for encoder and decoder for
 init

---
 transformers/modeling_seq2seq.py | 61 ++++++++++++++------------------
 1 file changed, 27 insertions(+), 34 deletions(-)

diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index 466a101f47..2154a4699d 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -21,21 +21,20 @@ import logging
 import torch
 from torch import nn
 
-from .modeling_auto import AutoModel, AutoModelWithLMHead
-
-from .modeling_utils import PreTrainedModel, SequenceSummary
-
 from .file_utils import add_start_docstrings
+from .modeling_auto import AutoModel, AutoModelWithLMHead
+from .modeling_utils import PreTrainedModel, SequenceSummary
 
 logger = logging.getLogger(__name__)
 
 
 class PreTrainedSeq2seq(nn.Module):
     r"""
-        :class:`~transformers.Seq2seq` is a generic model class
-        that will be instantiated as a Seq2seq model with one of the base model classes of the library
-        as encoder and (optionally) as decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
-        class method.
+        :class:`~transformers.Seq2seq` is a generic model class that will be
+        instantiated as a Seq2seq model with one of the base model classes of
+        the library as encoder and (optionally) as decoder when created with
+        the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class
+        method.
     """
     def __init__(self, encoder, decoder):
         super(PreTrainedSeq2seq, self).__init__()
@@ -43,7 +42,7 @@ class PreTrainedSeq2seq(nn.Module):
         self.decoder = decoder
 
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+    def from_pretrained(cls, encoder_pretrained_model_name_or_path, decoder_pretrained_model_name_or_path, *model_args, **kwargs):
         r""" Instantiates one of the base model classes of the library
         from a pre-trained model configuration.
             The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
@@ -100,40 +99,34 @@ class PreTrainedSeq2seq(nn.Module):
             # Loading from a TF checkpoint file instead of a PyTorch model (slower)
             config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
             model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
-
         """
-        # Extract encoder and decoder model if provided
-        encoder_model = kwargs.pop('encoder_model', None)
-        decoder_model = kwargs.pop('decoder_model', None)
 
-        # Extract decoder kwargs so we only have encoder kwargs for now
-        if decoder_model is None:
-            decoder_pretrained_model_name_or_path = kwargs.pop('decoder_pretrained_model_name_or_path', pretrained_model_name_or_path)
-        decoder_kwargs = {}
-        for key in kwargs.keys():
+        # Separate the encoder- and decoder- specific kwargs. A kwarg is
+        # decoder-specific it the key starts with `decoder_`
+        kwargs_decoder = {}
+        kwargs_encoder = kwargs
+        for key in kwargs_encoder.keys():
             if key.startswith('decoder_'):
-                decoder_kwargs[key.replace('decoder_', '')] = kwargs.pop(key)
+                kwargs_decoder[key.replace('decoder_', '')] = kwargs_encoder.pop(key)
 
-        # Load and initialize the decoder
-        if encoder_model:
-            encoder = encoder_model
-        else:
-            # Load and initialize the encoder
-            kwargs['is_decoder'] = False  # Make sure the encoder will be an encoder
-            encoder = AutoModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        # Load and initialize the encoder and decoder
+        #  The distinction between encoder and decoder at the model level is made
+        #  by the value of the flag `is_decoder` that we need to set correctly.
+        encoder = kwargs.pop('encoder_model', None)
+        if encoder is None:
+            kwargs_encoder['is_decoder'] = False
+            encoder = AutoModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs_encoder)
 
-        # Load and initialize the decoder
-        if decoder_model:
-            decoder = decoder_model
-        else:
-            kwargs.update(decoder_kwargs)  # Replace encoder kwargs with decoder specific kwargs like config, state_dict, etc...
-            kwargs['is_decoder'] = True  # Make sure the decoder will be a decoder
-            decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
+        decoder = kwargs.pop('decoder_model', None)
+        if decoder is None:
+            kwargs_decoder['is_decoder'] = True
+            decoder_model = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
 
         model = cls(encoder, decoder)
+
         return model
 
-    def forward(self, *inputs, *kwargs):
+    def forward(self, *inputs, **kwargs):
         # Extract decoder inputs
         decoder_kwargs = {}
         for key in kwargs.keys():

From 4c81960b9bc0f553ddf800df16bb82804e162bcb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi.louf@kering.com>
Date: Tue, 15 Oct 2019 17:53:38 +0200
Subject: [PATCH 067/144] comment the seq2seq functions

---
 transformers/modeling_seq2seq.py | 81 +++++++++++++++++++-------------
 1 file changed, 49 insertions(+), 32 deletions(-)

diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index 2154a4699d..b326f2bc1e 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -43,13 +43,21 @@ class PreTrainedSeq2seq(nn.Module):
 
     @classmethod
     def from_pretrained(cls, encoder_pretrained_model_name_or_path, decoder_pretrained_model_name_or_path, *model_args, **kwargs):
-        r""" Instantiates one of the base model classes of the library
-        from a pre-trained model configuration.
-            The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
-            To train the model, you should first set it back in training mode with `model.train()`
+        r""" Instantiates an encoder and a decoder from one or two base classes
+        of the library from pre-trained model checkpoints.
+
+
+        The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
+        To train the model, you need to first set it back in training mode with `model.train()`
 
         Params:
-            pretrained_model_name_or_path: either:
+            encoder_pretrained_model_name_or_path: information necessary to initiate the encoder. Either:
+
+                - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
+
+            decoder_pretrained_model_name_or_path: information necessary to initiate the decoder. Either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
@@ -84,21 +92,17 @@ class PreTrainedSeq2seq(nn.Module):
             output_loading_info: (`optional`) boolean:
                 Set to ``True`` to also return a dictionnary containing missing keys, unexpected keys and error messages.
 
-            kwargs: (`optional`) Remaining dictionary of keyword arguments:
+            kwargs: (`optional`) Remaining dictionary of keyword arguments.
                 Can be used to update the configuration object (after it being loaded) and initiate the model. (e.g. ``output_attention=True``). Behave differently depending on whether a `config` is provided or automatically loaded:
 
                 - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                 - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
 
+                You can specify different kwargs for the decoder by prefixing the key with `decoder_` (e.g. ``decoder_output_attention=True``).
+
         Examples::
 
-            model = AutoModel.from_pretrained('bert-base-uncased')    # Download model and configuration from S3 and cache.
-            model = AutoModel.from_pretrained('./test/bert_model/')  # E.g. model was saved using `save_pretrained('./test/saved_model/')`
-            model = AutoModel.from_pretrained('bert-base-uncased', output_attention=True)  # Update configuration during loading
-            assert model.config.output_attention == True
-            # Loading from a TF checkpoint file instead of a PyTorch model (slower)
-            config = AutoConfig.from_json_file('./tf_model/bert_tf_model_config.json')
-            model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
+            model = PreTrainedSeq2seq.from_pretained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
         """
 
         # Separate the encoder- and decoder- specific kwargs. A kwarg is
@@ -115,35 +119,49 @@ class PreTrainedSeq2seq(nn.Module):
         encoder = kwargs.pop('encoder_model', None)
         if encoder is None:
             kwargs_encoder['is_decoder'] = False
-            encoder = AutoModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs_encoder)
+            encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
 
         decoder = kwargs.pop('decoder_model', None)
         if decoder is None:
             kwargs_decoder['is_decoder'] = True
-            decoder_model = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs)
+            decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
 
         model = cls(encoder, decoder)
 
         return model
 
     def forward(self, *inputs, **kwargs):
-        # Extract decoder inputs
-        decoder_kwargs = {}
-        for key in kwargs.keys():
-            if key.startswith('decoder_'):
-                decoder_kwargs[key.replace('decoder_', '')] = kwargs.pop(key)
+        """ The forward pass on a seq2eq depends what we are performing:
 
-        # Compute encoder hidden states if needed
-        encoder_hidden_states = kwargs.pop('encoder_hidden_states', None)
+        - During training we perform one forward pass through both the encoder
+          and decoder;
+        - During prediction, we perform one forward pass through the encoder,
+          and then perform several forward passes with the encoder's hidden
+          state through the decoder to decode a full sequence.
+
+        Therefore, we skip the forward pass on the encoder if an argument named
+        `encoder_hidden_state` is passed to this function.
+
+        """
+        # Separate the encoder- and decoder- specific kwargs. A kwarg is
+        # decoder-specific it the key starts with `decoder_`
+        kwargs_decoder = {}
+        kwargs_encoder = kwargs
+        for key in kwargs_encoder.keys():
+            if key.startswith('decoder_'):
+                kwargs_decoder[key.replace('decoder_', '')] = kwargs_encoder.pop(key)
+
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop('encoder_hidden_states', None)
         if encoder_hidden_states is None:
-            encoder_outputs = self.encoder(*inputs, *kwargs)
+            encoder_outputs = self.encoder(*inputs, **kwargs_encoder)
             encoder_hidden_states = encoder_outputs[0]
         else:
-            encoder_outputs = (,)
+            encoder_outputs = ()
 
         # Decode
-        decoder_kwargs['encoder_hidden_states'] = encoder_hidden_states
-        decoder_outputs = self.decoder(**decoder_kwargs)
+        kwargs_decoder['encoder_hidden_states'] = encoder_hidden_states
+        decoder_outputs = self.decoder(**kwargs_decoder)
 
         return decoder_outputs + encoder_outputs
 
@@ -161,11 +179,10 @@ class Model2LSTM(PreTrainedSeq2seq):
             # We will create a randomly initilized LSTM model as decoder
             if 'decoder_config' not in kwargs:
                 raise ValueError("To load an LSTM in Seq2seq model, please supply either: "
-                                "    - a torch.nn.LSTM model as `decoder_model` parameter (`decoder_model=lstm_model`), or "
-                                "    - a dictionary of configuration parameters that will be used to initialize a
-                                "        torch.nn.LSTM model as `decoder_config` keyword argument. "
-                                "        E.g. `decoder_config=\{'input_size': 768, 'hidden_size': 768, 'num_layers': 2\}`")
-            kwargs['decoder_model'] = torch.nn.LSTM(kwarg.pop('decoder_config'))
+                                 "    - a torch.nn.LSTM model as `decoder_model` parameter (`decoder_model=lstm_model`), or"
+                                 "    - a dictionary of configuration parameters that will be used to initialize a"
+                                 "      torch.nn.LSTM model as `decoder_config` keyword argument. "
+                                 "      E.g. `decoder_config={'input_size': 768, 'hidden_size': 768, 'num_layers': 2}`")
+            kwargs['decoder_model'] = torch.nn.LSTM(kwargs.pop('decoder_config'))
         model = super(Model2LSTM, cls).from_pretrained(*args, **kwargs)
         return model
-

From 488a6641513face9c03b537b6fe3210dbdb39f36 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi.louf@kering.com>
Date: Tue, 15 Oct 2019 21:03:32 +0200
Subject: [PATCH 068/144] add `is_decoder` attribute to `PretrainedConfig`

We currently instantiate encoders and decoders for the seq2seq by
passing the `is_decoder` keyword argument to the `from_pretrained`
classmethod. On the other hand, the model class looks for the value
of the `is_decoder` attribute in its config.

In order for the value to propagate from the kwarg to the configuration
we simply need to define `is_decoder` as an attribute to the base
`PretrainedConfig`, with a default at `False`.
---
 transformers/configuration_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py
index 8a23be4ff6..228150fc89 100644
--- a/transformers/configuration_utils.py
+++ b/transformers/configuration_utils.py
@@ -56,6 +56,7 @@ class PretrainedConfig(object):
         self.torchscript = kwargs.pop('torchscript', False)
         self.use_bfloat16 = kwargs.pop('use_bfloat16', False)
         self.pruned_heads = kwargs.pop('pruned_heads', {})
+        self.is_decoder = kwargs.pop('is_decoder', False)
 
     def save_pretrained(self, save_directory):
         """ Save a configuration object to the directory `save_directory`, so that it

From c5a94a6100afdd550fb3ea445d8bddc6b9769fcc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi.louf@kering.com>
Date: Wed, 16 Oct 2019 12:50:36 +0200
Subject: [PATCH 069/144] fix function that defines masks in XLM

The definition of `get_masks` would blow up with the right combination of
arguments: when a `padding_mask` was passed together with `causal=True`,
`alen` was never defined. It was just a matter of moving a definition
outside of a control structure.
---
 transformers/modeling_xlm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py
index b29e721556..f1df6f668f 100644
--- a/transformers/modeling_xlm.py
+++ b/transformers/modeling_xlm.py
@@ -73,16 +73,16 @@ def get_masks(slen, lengths, causal, padding_mask=None):
     """
     Generate hidden states mask, and optionally an attention mask.
     """
-    bs = lengths.size(0)
+    alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
     if padding_mask is not None:
         mask = padding_mask
     else:
         assert lengths.max().item() <= slen
-        alen = torch.arange(slen, dtype=torch.long, device=lengths.device)
         mask = alen < lengths[:, None]
 
     # attention mask is the same as mask, or triangular inferior attention (causal)
     if causal:
+        bs = lengths.size(0)
         attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
     else:
         attn_mask = mask

From 075206961700fd2359f5c5cfc86a8c18d8404406 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remi.louf@kering.com>
Date: Wed, 16 Oct 2019 16:12:22 +0200
Subject: [PATCH 070/144] adapt attention masks for the decoder case

The introduction of a decoder introduces 2 changes:
- We need to be able to specify a separate mask in the cross
attention to mask the positions corresponding to padding tokens in the
encoder state.
- The self-attention in the decoder needs to be causal on top of not
attending to padding tokens.
---
 transformers/modeling_bert.py | 66 +++++++++++++++++++++++++++--------
 1 file changed, 51 insertions(+), 15 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index fbf3c84646..cd9151cf62 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -198,12 +198,16 @@ class BertSelfAttention(nn.Module):
         x = x.view(*new_x_shape)
         return x.permute(0, 2, 1, 3)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
         mixed_query_layer = self.query(hidden_states)
+
         # if the attention Module is a encoder-decoder self attention module
+        # they keys & values are given by the encoder; the attention mask
+        # needs to be such that there is no atention on the encoder's padding tokens.
         if encoder_hidden_states is not None:
             mixed_key_layer = self.key(encoder_hidden_states)
             mixed_value_layer = self.value(encoder_hidden_states)
+            attention_mask = encoder_attention_mask
         else:
             mixed_key_layer = self.key(hidden_states)
             mixed_value_layer = self.value(hidden_states)
@@ -284,8 +288,8 @@ class BertAttention(nn.Module):
         self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None):
-        self_outputs = self.self(hidden_states, attention_mask, head_mask, encoder_hidden_states)
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
+        self_outputs = self.self(hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
         attention_output = self.output(self_outputs[0], hidden_states)
         outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
         return outputs
@@ -330,13 +334,13 @@ class BertLayer(nn.Module):
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_state=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_state=None, encoder_attention_mask=None):
         self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
         attention_output = self_attention_outputs[0]
         outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
 
         if self.is_decoder and encoder_hidden_state is not None:
-            cross_attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_state)
+            cross_attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_state, encoder_attention_mask)
             attention_output = cross_attention_outputs[0]
             outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
 
@@ -346,6 +350,7 @@ class BertLayer(nn.Module):
         return outputs
 
 
+# NOTE I think we may need to call encoder_hidden_states[i] for each layer
 class BertEncoder(nn.Module):
     def __init__(self, config):
         super(BertEncoder, self).__init__()
@@ -353,14 +358,14 @@ class BertEncoder(nn.Module):
         self.output_hidden_states = config.output_hidden_states
         self.layer = nn.ModuleList([BertLayer(config) for _ in range(config.num_hidden_layers)])
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
         all_hidden_states = ()
         all_attentions = ()
         for i, layer_module in enumerate(self.layer):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
-            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], encoder_hidden_states)
+            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask)
             hidden_states = layer_outputs[0]
 
             if self.output_attentions:
@@ -579,6 +584,7 @@ class BertModel(BertPreTrainedModel):
     """
     def __init__(self, config):
         super(BertModel, self).__init__(config)
+        self.config = config
 
         self.embeddings = BertEmbeddings(config)
         self.encoder = BertEncoder(config)
@@ -601,18 +607,47 @@ class BertModel(BertPreTrainedModel):
             self.encoder.layer[layer].attention.prune_heads(heads)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None,
-                head_mask=None, encoder_hidden_state=None):
+                head_mask=None, encoder_hidden_state=None, encoder_attention_mask=None):
+        """ Forward pass on the Model.
+
+        The model can behave as an encoder (with only self-attention) as well
+        as a decoder, in which case a layer of cross-attention is added between
+        ever self-attention layer, following the architecture described in [1].
+
+        To behave like as a decoder the model needs to be initialized with the
+        `is_decoder` argument of the config set to `True`. An
+        `encoder_hidden_state` is expected as an input to the forward pass.
+        When a decoder, there are two kinds of attention masks to specify:
+
+        (1) Self-attention masks that need to be causal (only attends to
+        previous tokens);
+        (2) A cross-attention mask that prevents the module
+        from attending to the encoder' padding tokens.
+
+        [1] Vaswani, Ashish, et al. "Attention is all you need." Advances in
+        neural information processing systems. 2017.
+        """
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
         if token_type_ids is None:
             token_type_ids = torch.zeros_like(input_ids)
 
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # this attention mask is more simple than the triangular masking of causal attention
-        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+        # we may want to provide a mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just make it broadcastable to all heads.
+        if attention_mask.dims() == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+
+        # provided a padding mask of dimensions [batch_size, seq_length]
+        # - if encoder, make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        # - if decoder, make it causal
+        if attention_mask.dims() == 2:
+            if self.config.is_decoder:
+                batch_size, seq_length = input_ids.size()
+                seq_ids = torch.arange(seq_length)
+                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
+                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[None, None, :, :]
+            else:
+                extended_attention_mask = attention_mask[:, None, None, :]
 
         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
@@ -641,7 +676,8 @@ class BertModel(BertPreTrainedModel):
         encoder_outputs = self.encoder(embedding_output,
                                        attention_mask=extended_attention_mask,
                                        head_mask=head_mask,
-                                       encoder_hidden_state=encoder_hidden_state)
+                                       encoder_hidden_state=encoder_hidden_state,
+                                       encoder_attention_mask=encoder_attention_mask)
         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output)
 

From c5441946112e68441b46866d114bf8d3c29b0c1d Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Wed, 16 Oct 2019 11:05:13 -0400
Subject: [PATCH 071/144] Remove `special_tokens_mask` from inputs in README

Co-authored-by: Thomas Wolf @thomwolf
---
 README.md | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index e44ff52099..da0de4ae81 100644
--- a/README.md
+++ b/README.md
@@ -243,8 +243,9 @@ sentence_2 = "His findings were not compatible with this research."
 inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
 inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
 
-pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
-pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
+pred_1 = pytorch_model(inputs_1['input_ids'], token_type_ids=inputs_1['token_type_ids'])[0].argmax().item()
+pred_2 = pytorch_model(inputs_2['input_ids'], token_type_ids=inputs_2['token_type_ids'])[0].argmax().item()
+
 print("sentence_1 is", "a paraphrase" if pred_1 else "not a paraphrase", "of sentence_0")
 print("sentence_2 is", "a paraphrase" if pred_2 else "not a paraphrase", "of sentence_0")
 ```

From 33c01368b19701bc6e5ea886f108663752d31d86 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 16 Oct 2019 18:13:05 +0200
Subject: [PATCH 072/144] remove Bert2Rnd test

---
 transformers/tests/modeling_bert_test.py | 13 +------------
 1 file changed, 1 insertion(+), 12 deletions(-)

diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py
index e649cd8ce8..6c39c4e4db 100644
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -29,7 +29,7 @@ if is_torch_available():
     from transformers import (BertConfig, BertModel, BertForMaskedLM,
                               BertForNextSentencePrediction, BertForPreTraining,
                               BertForQuestionAnswering, BertForSequenceClassification,
-                              BertForTokenClassification, BertForMultipleChoice, Bert2Rnd)
+                              BertForTokenClassification, BertForMultipleChoice)
     from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
 else:
     pytestmark = pytest.mark.skip("Require Torch")
@@ -255,17 +255,6 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.num_choices])
             self.check_loss_output(result)
 
-        def create_and_check_bert2rnd(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
-            config.num_choices = self.num_choices
-            model = Bert2Rnd(config=config)
-            model.eval()
-            bert2rnd_inputs_ids = input_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            bert2rnd_token_type_ids = token_type_ids.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            bert2rnd_input_mask = input_mask.unsqueeze(1).expand(-1, self.num_choices, -1).contiguous()
-            _ = model(bert2rnd_inputs_ids,
-                      attention_mask=bert2rnd_input_mask,
-                      token_type_ids=bert2rnd_token_type_ids)
-
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids, token_type_ids, input_mask,

From a424892fab8ddfe631d7498bc44072aa3a42eb3c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 16 Oct 2019 18:24:32 +0200
Subject: [PATCH 073/144] correct method name: dim() and not dims()

---
 transformers/modeling_bert.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index cd9151cf62..e717031dcb 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -634,13 +634,13 @@ class BertModel(BertPreTrainedModel):
 
         # we may want to provide a mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just make it broadcastable to all heads.
-        if attention_mask.dims() == 3:
+        if attention_mask.dim() == 3:
             extended_attention_mask = attention_mask[:, None, :, :]
 
         # provided a padding mask of dimensions [batch_size, seq_length]
         # - if encoder, make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
         # - if decoder, make it causal
-        if attention_mask.dims() == 2:
+        if attention_mask.dim() == 2:
             if self.config.is_decoder:
                 batch_size, seq_length = input_ids.size()
                 seq_ids = torch.arange(seq_length)
@@ -816,13 +816,15 @@ class BertForMaskedLM(BertPreTrainedModel):
                                    self.bert.embeddings.word_embeddings)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
-                masked_lm_labels=None):
+                masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None):
 
         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
                             token_type_ids=token_type_ids,
                             position_ids=position_ids,
-                            head_mask=head_mask)
+                            head_mask=head_mask,
+                            encoder_hidden_states=encoder_hidden_states,
+                            encoder_attention_mask=encoder_attention_mask)
 
         sequence_output = outputs[0]
         prediction_scores = self.cls(sequence_output)
@@ -833,6 +835,15 @@ class BertForMaskedLM(BertPreTrainedModel):
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             outputs = (masked_lm_loss,) + outputs
 
+        if encoder_hidden_states is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+
+            # shift predictions scores and input ids by one before computing loss
+            prediction_scores = prediction_scores[:, :-1, :]
+            input_ids = input_ids[:, 1:, :]
+            seq2seq_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), input_ids.view(-1))
+            outputs = (seq2seq_loss,) + outputs
+
         return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
 
 

From e4e0ee14bd481fe32e82578665284ea5bf4f5677 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 16 Oct 2019 20:05:32 +0200
Subject: [PATCH 074/144] add separator between data import and train

---
 examples/run_seq2seq_finetuning.py | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index f05a5847ed..2e8d0aa250 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -52,6 +52,10 @@ def set_seed(args):
     torch.manual_seed(args.seed)
 
 
+# ------------
+# Load dataset
+# ------------
+
 class TextDataset(Dataset):
     """ Abstracts the dataset used to train seq2seq models.
 
@@ -212,6 +216,11 @@ def load_and_cache_examples(args, tokenizer):
     return dataset
 
 
+# ------------
+# Train
+# ------------
+
+
 def train(args, train_dataset, model, tokenizer):
     """ Fine-tune the pretrained model on the corpus. """
     raise NotImplementedError

From 95ec1d08bef7b780653f9f2c59fddb4a97873cff Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 16 Oct 2019 20:55:42 +0200
Subject: [PATCH 075/144] separate inputs into encoder & decoder inputs

---
 transformers/modeling_seq2seq.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index b326f2bc1e..8f27224a56 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -130,7 +130,7 @@ class PreTrainedSeq2seq(nn.Module):
 
         return model
 
-    def forward(self, *inputs, **kwargs):
+    def forward(self, encoder_input_ids, decoder_input_ids, **kwargs):
         """ The forward pass on a seq2eq depends what we are performing:
 
         - During training we perform one forward pass through both the encoder
@@ -142,6 +142,11 @@ class PreTrainedSeq2seq(nn.Module):
         Therefore, we skip the forward pass on the encoder if an argument named
         `encoder_hidden_state` is passed to this function.
 
+        Params:
+            encoder_input_ids: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``
+                Indices of encoder input sequence tokens in the vocabulary.
+            decoder_input_ids: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``
+                Indices of decoder input sequence tokens in the vocabulary.
         """
         # Separate the encoder- and decoder- specific kwargs. A kwarg is
         # decoder-specific it the key starts with `decoder_`
@@ -154,14 +159,14 @@ class PreTrainedSeq2seq(nn.Module):
         # Encode if needed (training, first prediction pass)
         encoder_hidden_states = kwargs_encoder.pop('encoder_hidden_states', None)
         if encoder_hidden_states is None:
-            encoder_outputs = self.encoder(*inputs, **kwargs_encoder)
+            encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
             encoder_hidden_states = encoder_outputs[0]
         else:
             encoder_outputs = ()
 
         # Decode
         kwargs_decoder['encoder_hidden_states'] = encoder_hidden_states
-        decoder_outputs = self.decoder(**kwargs_decoder)
+        decoder_outputs = self.decoder(decoder_input_ids, **kwargs_decoder)
 
         return decoder_outputs + encoder_outputs
 

From 9b71fc9a18bbd49a699a338abe1891320c818108 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 16 Oct 2019 21:31:38 +0200
Subject: [PATCH 076/144] tying weights is going to be a clusterfuck

---
 transformers/modeling_seq2seq.py | 81 ++++++++++++++++++++++----------
 1 file changed, 56 insertions(+), 25 deletions(-)

diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index 8f27224a56..4e76a1b8e7 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -28,7 +28,7 @@ from .modeling_utils import PreTrainedModel, SequenceSummary
 logger = logging.getLogger(__name__)
 
 
-class PreTrainedSeq2seq(nn.Module):
+class PreTrainedSeq2seq(PreTrainedModel):
     r"""
         :class:`~transformers.Seq2seq` is a generic model class that will be
         instantiated as a Seq2seq model with one of the base model classes of
@@ -36,13 +36,20 @@ class PreTrainedSeq2seq(nn.Module):
         the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class
         method.
     """
+
     def __init__(self, encoder, decoder):
         super(PreTrainedSeq2seq, self).__init__()
         self.encoder = encoder
         self.decoder = decoder
 
     @classmethod
-    def from_pretrained(cls, encoder_pretrained_model_name_or_path, decoder_pretrained_model_name_or_path, *model_args, **kwargs):
+    def from_pretrained(
+        cls,
+        encoder_pretrained_model_name_or_path,
+        decoder_pretrained_model_name_or_path,
+        *model_args,
+        **kwargs
+    ):
         r""" Instantiates an encoder and a decoder from one or two base classes
         of the library from pre-trained model checkpoints.
 
@@ -110,21 +117,25 @@ class PreTrainedSeq2seq(nn.Module):
         kwargs_decoder = {}
         kwargs_encoder = kwargs
         for key in kwargs_encoder.keys():
-            if key.startswith('decoder_'):
-                kwargs_decoder[key.replace('decoder_', '')] = kwargs_encoder.pop(key)
+            if key.startswith("decoder_"):
+                kwargs_decoder[key.replace("decoder_", "")] = kwargs_encoder.pop(key)
 
         # Load and initialize the encoder and decoder
         #  The distinction between encoder and decoder at the model level is made
         #  by the value of the flag `is_decoder` that we need to set correctly.
-        encoder = kwargs.pop('encoder_model', None)
+        encoder = kwargs.pop("encoder_model", None)
         if encoder is None:
-            kwargs_encoder['is_decoder'] = False
-            encoder = AutoModel.from_pretrained(encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder)
+            kwargs_encoder["is_decoder"] = False
+            encoder = AutoModel.from_pretrained(
+                encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder
+            )
 
-        decoder = kwargs.pop('decoder_model', None)
+        decoder = kwargs.pop("decoder_model", None)
         if decoder is None:
-            kwargs_decoder['is_decoder'] = True
-            decoder = AutoModelWithLMHead.from_pretrained(decoder_pretrained_model_name_or_path, **kwargs_decoder)
+            kwargs_decoder["is_decoder"] = True
+            decoder = AutoModelWithLMHead.from_pretrained(
+                decoder_pretrained_model_name_or_path, **kwargs_decoder
+            )
 
         model = cls(encoder, decoder)
 
@@ -153,11 +164,11 @@ class PreTrainedSeq2seq(nn.Module):
         kwargs_decoder = {}
         kwargs_encoder = kwargs
         for key in kwargs_encoder.keys():
-            if key.startswith('decoder_'):
-                kwargs_decoder[key.replace('decoder_', '')] = kwargs_encoder.pop(key)
+            if key.startswith("decoder_"):
+                kwargs_decoder[key.replace("decoder_", "")] = kwargs_encoder.pop(key)
 
         # Encode if needed (training, first prediction pass)
-        encoder_hidden_states = kwargs_encoder.pop('encoder_hidden_states', None)
+        encoder_hidden_states = kwargs_encoder.pop("encoder_hidden_states", None)
         if encoder_hidden_states is None:
             encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
             encoder_hidden_states = encoder_outputs[0]
@@ -165,29 +176,49 @@ class PreTrainedSeq2seq(nn.Module):
             encoder_outputs = ()
 
         # Decode
-        kwargs_decoder['encoder_hidden_states'] = encoder_hidden_states
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
         decoder_outputs = self.decoder(decoder_input_ids, **kwargs_decoder)
 
         return decoder_outputs + encoder_outputs
 
 
 class Model2Model(PreTrainedSeq2seq):
-    def tie_weights():
-        # We should tie encoder and decoder embeddings if possible here
-        pass
+    def __init__(self):
+        super(Model2Model, self).__init__()
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Tying the encoder and decoders' embeddings together.
+
+        We need for each to get down to the embedding weights. However the
+        different model classes are inconsistent to that respect:
+        - BertModel: embeddings.word_embeddings
+        - RoBERTa: embeddings.word_embeddings
+        - XLMModel: embeddings
+        - GPT2: wte
+        - BertForMaskedLM: bert.embeddings.word_embeddings
+        - RobertaForMaskedLM: roberta.embeddings.word_embeddings
+
+        argument of the XEmbedding layer for each model, but it is "blocked"
+        by a model-specific keyword (bert, )...
+        """
+        # self._tie_or_clone_weights(self.encoder, self.decoder)
+        raise NotImplementedError
 
 
 class Model2LSTM(PreTrainedSeq2seq):
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
-        if kwargs.get('decoder_model', None) is None:
+        if kwargs.get("decoder_model", None) is None:
             # We will create a randomly initilized LSTM model as decoder
-            if 'decoder_config' not in kwargs:
-                raise ValueError("To load an LSTM in Seq2seq model, please supply either: "
-                                 "    - a torch.nn.LSTM model as `decoder_model` parameter (`decoder_model=lstm_model`), or"
-                                 "    - a dictionary of configuration parameters that will be used to initialize a"
-                                 "      torch.nn.LSTM model as `decoder_config` keyword argument. "
-                                 "      E.g. `decoder_config={'input_size': 768, 'hidden_size': 768, 'num_layers': 2}`")
-            kwargs['decoder_model'] = torch.nn.LSTM(kwargs.pop('decoder_config'))
+            if "decoder_config" not in kwargs:
+                raise ValueError(
+                    "To load an LSTM in Seq2seq model, please supply either: "
+                    "    - a torch.nn.LSTM model as `decoder_model` parameter (`decoder_model=lstm_model`), or"
+                    "    - a dictionary of configuration parameters that will be used to initialize a"
+                    "      torch.nn.LSTM model as `decoder_config` keyword argument. "
+                    "      E.g. `decoder_config={'input_size': 768, 'hidden_size': 768, 'num_layers': 2}`"
+                )
+            kwargs["decoder_model"] = torch.nn.LSTM(kwargs.pop("decoder_config"))
         model = super(Model2LSTM, cls).from_pretrained(*args, **kwargs)
         return model

From 624a5644cc9585b19c90cb2a20e343f7ff326d82 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 09:27:56 +0200
Subject: [PATCH 077/144] revert black formatting to conform with lib style

---
 transformers/modeling_seq2seq.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index 4e76a1b8e7..cc5cc53bc3 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -43,13 +43,7 @@ class PreTrainedSeq2seq(PreTrainedModel):
         self.decoder = decoder
 
     @classmethod
-    def from_pretrained(
-        cls,
-        encoder_pretrained_model_name_or_path,
-        decoder_pretrained_model_name_or_path,
-        *model_args,
-        **kwargs
-    ):
+    def from_pretrained(cls, encoder_pretrained_model_name_or_path, decoder_pretrained_model_name_or_path, *model_args, **kwargs):
         r""" Instantiates an encoder and a decoder from one or two base classes
         of the library from pre-trained model checkpoints.
 
@@ -190,7 +184,7 @@ class Model2Model(PreTrainedSeq2seq):
     def tie_weights(self):
         """ Tying the encoder and decoders' embeddings together.
 
-        We need for each to get down to the embedding weights. However the
+       We need for each to get down to the embedding weights. However the
         different model classes are inconsistent to that respect:
         - BertModel: embeddings.word_embeddings
         - RoBERTa: embeddings.word_embeddings

From 4e0f24348fcc9902664951677ffc7c8cc171443d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 09:41:53 +0200
Subject: [PATCH 078/144] document the MLM modification + raise exception on
 MLM training with encoder-decoder

---
 transformers/modeling_bert.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index e717031dcb..2553bc0efb 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -830,21 +830,30 @@ class BertForMaskedLM(BertPreTrainedModel):
         prediction_scores = self.cls(sequence_output)
 
         outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
+
+        # Although this may seem awkward, BertForMaskedLM supports two scenarios:
+        # 1. If a tensor that contains the indices of masked labels is provided,
+        #    the cross-entropy is the MLM cross-entropy that measures the likelihood
+        #    of predictions for masked words.
+        # 2. If encoder hidden states are provided we are in a causal situation where we
+        #    try to predict the next word for each input in the encoder.
+        if masked_lm_labels is not None and encoder_hidden_states is not None:
+            raise AttributeError("Masked LM training with an encoder-decoder is not supported.")
+
         if masked_lm_labels is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss_fct = CrossEntropyLoss(ignore_index=-1)  # -1 index = padding token
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             outputs = (masked_lm_loss,) + outputs
 
         if encoder_hidden_states is not None:
-            loss_fct = CrossEntropyLoss(ignore_index=-1)
-
-            # shift predictions scores and input ids by one before computing loss
+            # we are doing next-token prediction; shift prediction scores and input ids by one
             prediction_scores = prediction_scores[:, :-1, :]
             input_ids = input_ids[:, 1:, :]
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
             seq2seq_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), input_ids.view(-1))
             outputs = (seq2seq_loss,) + outputs
 
-        return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
+        return outputs  # (mlm_or_seq2seq_loss), prediction_scores, (hidden_states), (attentions)
 
 
 @add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,

From 638fe7f5a4f5c3bec5b39cee374f13b4675cdb18 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 10:13:07 +0200
Subject: [PATCH 079/144] correct composition of padding and causal masks

---
 transformers/modeling_bert.py | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 2553bc0efb..05ab3395de 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -288,8 +288,8 @@ class BertAttention(nn.Module):
         self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
-        self_outputs = self.self(hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_state=None, encoder_attention_mask=None):
+        self_outputs = self.self(hidden_states, attention_mask, head_mask, encoder_hidden_state, encoder_attention_mask)
         attention_output = self.output(self_outputs[0], hidden_states)
         outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
         return outputs
@@ -350,7 +350,6 @@ class BertLayer(nn.Module):
         return outputs
 
 
-# NOTE I think we may need to call encoder_hidden_states[i] for each layer
 class BertEncoder(nn.Module):
     def __init__(self, config):
         super(BertEncoder, self).__init__()
@@ -365,7 +364,8 @@ class BertEncoder(nn.Module):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
-            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask)
+            encoder_hidden_state = encoder_hidden_states[i]
+            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], encoder_hidden_state, encoder_attention_mask)
             hidden_states = layer_outputs[0]
 
             if self.output_attentions:
@@ -607,22 +607,26 @@ class BertModel(BertPreTrainedModel):
             self.encoder.layer[layer].attention.prune_heads(heads)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None,
-                head_mask=None, encoder_hidden_state=None, encoder_attention_mask=None):
+                head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
         """ Forward pass on the Model.
 
+        The values of the attention matrix (shape [batch_size, seq_length])
+        should be 1.0 for the position we want to attend to and 0. for the ones
+        we do not want to attend to.
+
         The model can behave as an encoder (with only self-attention) as well
         as a decoder, in which case a layer of cross-attention is added between
         ever self-attention layer, following the architecture described in [1].
 
         To behave like as a decoder the model needs to be initialized with the
         `is_decoder` argument of the config set to `True`. An
-        `encoder_hidden_state` is expected as an input to the forward pass.
+        `encoder_hidden_states` is expected as an input to the forward pass.
         When a decoder, there are two kinds of attention masks to specify:
 
         (1) Self-attention masks that need to be causal (only attends to
         previous tokens);
         (2) A cross-attention mask that prevents the module
-        from attending to the encoder' padding tokens.
+        from attending to the encoder's padding tokens.
 
         [1] Vaswani, Ashish, et al. "Attention is all you need." Advances in
         neural information processing systems. 2017.
@@ -632,20 +636,20 @@ class BertModel(BertPreTrainedModel):
         if token_type_ids is None:
             token_type_ids = torch.zeros_like(input_ids)
 
-        # we may want to provide a mask of dimensions [batch_size, from_seq_length, to_seq_length]
-        # ourselves in which case we just make it broadcastable to all heads.
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
         if attention_mask.dim() == 3:
             extended_attention_mask = attention_mask[:, None, :, :]
 
-        # provided a padding mask of dimensions [batch_size, seq_length]
-        # - if encoder, make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
-        # - if decoder, make it causal
+        # Provided a padding mask of dimensions [batch_size, seq_length]
+        # - if the model is a decoder, apply a causal mask in addition to the padding mask
+        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if attention_mask.dim() == 2:
             if self.config.is_decoder:
                 batch_size, seq_length = input_ids.size()
                 seq_ids = torch.arange(seq_length)
                 causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
-                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[None, None, :, :]
+                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
             else:
                 extended_attention_mask = attention_mask[:, None, None, :]
 
@@ -676,7 +680,7 @@ class BertModel(BertPreTrainedModel):
         encoder_outputs = self.encoder(embedding_output,
                                        attention_mask=extended_attention_mask,
                                        head_mask=head_mask,
-                                       encoder_hidden_state=encoder_hidden_state,
+                                       encoder_hidden_states=encoder_hidden_states,
                                        encoder_attention_mask=encoder_attention_mask)
         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output)

From 87d60b6e19ee1c6d818e6cd5b7a3c4f56f5471ad Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 10:18:19 +0200
Subject: [PATCH 080/144] reword explanation of encoder_attention_mask

---
 transformers/modeling_bert.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 05ab3395de..be8ec5ba21 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -201,9 +201,9 @@ class BertSelfAttention(nn.Module):
     def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
         mixed_query_layer = self.query(hidden_states)
 
-        # if the attention Module is a encoder-decoder self attention module
-        # they keys & values are given by the encoder; the attention mask
-        # needs to be such that there is no atention on the encoder's padding tokens.
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
         if encoder_hidden_states is not None:
             mixed_key_layer = self.key(encoder_hidden_states)
             mixed_value_layer = self.value(encoder_hidden_states)

From c1bc709c3545fbafd7d7d9da01ba89d35aff6a79 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 10:41:53 +0200
Subject: [PATCH 081/144] correct the truncation and padding of dataset

---
 examples/run_seq2seq_finetuning.py | 48 +++++++-----------------------
 1 file changed, 10 insertions(+), 38 deletions(-)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index 2e8d0aa250..32f1782cab 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -104,9 +104,11 @@ class TextDataset(Dataset):
                     except IndexError:  # skip ill-formed stories
                         continue
 
-                story = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(story))
                 summary = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(summary))
-                story_seq, summary_seq = _fit_to_block_size(story, summary, block_size)
+                summary_seq = _fit_to_block_size(summary, block_size)
+
+                story = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(story))
+                story_seq = _fit_to_block_size(story, block_size)
 
                 self.examples.append(
                     tokenizer.add_special_token_sequence_pair(story_seq, summary_seq)
@@ -170,45 +172,15 @@ def _add_missing_period(line):
     return line + "."
 
 
-def _fit_to_block_size(src_sequence, tgt_sequence, block_size):
+def _fit_to_block_size(sequence, block_size):
     """ Adapt the source and target sequences' lengths to the block size.
-
-    If the concatenated sequence (source + target + 3 special tokens) would be
-    longer than the block size we use the 75% / 25% rule followed in [1]. For a
-    block size of 512 this means limiting the source sequence's length to 384
-    and the target sequence's length to 128.
-
-    Attributes:
-        src_sequence (list): a list of ids that maps to the tokens of the
-            source sequence.
-        tgt_sequence (list): a list of ids that maps to the tokens of the
-            target sequence.
-        block_size (int): the model's block size.
-
-    [1] Dong, Li, et al. "Unified Language Model Pre-training for Natural
-    Language Understanding and Generation." arXiv preprint arXiv:1905.03197 (2019).
+    If the sequence is shorter than the block size we pad it with -1 ids
+    which correspond to padding tokens.
     """
-    SRC_MAX_LENGTH = int(0.75 * block_size) - 2  # CLS and EOS token
-    TGT_MAX_LENGTH = block_size - (SRC_MAX_LENGTH + 2) - 1  # EOS token
-
-    # We dump the examples that are too small to fit in the block size for the
-    # sake of simplicity. You can modify this by adding model-specific padding.
-    if len(src_sequence) + len(tgt_sequence) + 3 < block_size:
-        return None
-
-    if len(src_sequence) > SRC_MAX_LENGTH:
-        if len(tgt_sequence) > TGT_MAX_LENGTH:
-            src_sequence = src_sequence[:SRC_MAX_LENGTH]
-            tgt_sequence = tgt_sequence[:TGT_MAX_LENGTH]
-        else:
-            remain_size = block_size - len(tgt_sequence) - 3
-            src_sequence = src_sequence[:remain_size]
+    if len(sequence) > block_size:
+        return sequence[:block_size]
     else:
-        if len(tgt_sequence) > TGT_MAX_LENGTH:
-            remain_size = block_size - len(src_sequence) - 3
-            tgt_sequence = tgt_sequence[:remain_size]
-
-    return src_sequence, tgt_sequence
+        return sequence.extend([-1] * [block_size - len(sequence)])
 
 
 def load_and_cache_examples(args, tokenizer):

From bfb9b540d408fd7f0592f321157fe0371c930c5e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 12:59:51 +0200
Subject: [PATCH 082/144] add Model2Model to __init__

---
 examples/run_seq2seq_finetuning.py | 18 ++----------------
 transformers/__init__.py           |  1 +
 2 files changed, 3 insertions(+), 16 deletions(-)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index 32f1782cab..94b29c3cd6 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -13,22 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning seq2seq models for sequence generation.
-
-We use the procedure described in [1] to finetune models for sequence
-generation. Let S1 and S2 be the source and target sequence respectively; we
-pack them using the start of sequence [EOS] and end of sequence [EOS] token:
-
-    [CLS] S1 [EOS] S2 [EOS]
-
-We then mask a fixed percentage of token from S2 at random and learn to predict
-the masked words. [EOS] can be masked during finetuning so the model learns to
-terminate the generation process.
-
-[1] Dong Li, Nan Yang, Wenhui Wang, Furu Wei, Xiaodong Liu, Yu Wang, Jianfeng
-Gao, Ming Zhou, and Hsiao-Wuen Hon.  “Unified Language Model Pre-Training for
-Natural Language Understanding and Generation.” (May 2019) ArXiv:1905.03197
-"""
+""" Finetuning seq2seq models for sequence generation."""
 
 import argparse
 from collections import deque
@@ -56,6 +41,7 @@ def set_seed(args):
 # Load dataset
 # ------------
 
+
 class TextDataset(Dataset):
     """ Abstracts the dataset used to train seq2seq models.
 
diff --git a/transformers/__init__.py b/transformers/__init__.py
index 5248bc9f1b..ee8e812a23 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -87,6 +87,7 @@ if is_torch_available():
     from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
                                 DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
                                 DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
+    from .modeling_seq2seq import Model2Model
 
     # Optimization
     from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,

From 47a06d88a00c59ea1fb54e92178b3f5d2e8e8973 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 13:04:26 +0200
Subject: [PATCH 083/144] use two different tokenizers for story and summary

---
 examples/run_seq2seq_finetuning.py | 54 ++++++++++++++++++++----------
 1 file changed, 36 insertions(+), 18 deletions(-)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index 94b29c3cd6..3e3cc34cb8 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -26,7 +26,7 @@ import numpy as np
 import torch
 from torch.utils.data import Dataset
 
-from transformers import BertTokenizer
+from transformers import AutoTokenizer, Model2Model
 
 logger = logging.getLogger(__name__)
 
@@ -57,7 +57,7 @@ class TextDataset(Dataset):
     [2] https://github.com/abisee/cnn-dailymail/
     """
 
-    def __init_(self, tokenizer, data_dir="", block_size=512):
+    def __init_(self, tokenizer_src, tokenizer_tgt, data_dir="", block_size=512):
         assert os.path.isdir(data_dir)
 
         # Load features that have already been computed if present
@@ -90,15 +90,13 @@ class TextDataset(Dataset):
                     except IndexError:  # skip ill-formed stories
                         continue
 
-                summary = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(summary))
-                summary_seq = _fit_to_block_size(summary, block_size)
-
-                story = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(story))
+                story = tokenizer_src.convert_tokens_to_ids(tokenizer_src.tokenize(story))
                 story_seq = _fit_to_block_size(story, block_size)
 
-                self.examples.append(
-                    tokenizer.add_special_token_sequence_pair(story_seq, summary_seq)
-                )
+                summary = tokenizer_tgt.convert_tokens_to_ids(tokenizer_tgt.tokenize(summary))
+                summary_seq = _fit_to_block_size(summary, block_size)
+
+                self.examples.append((story_seq, summary_seq))
 
         logger.info("Saving features into cache file %s", cached_features_file)
         with open(cached_features_file, "wb") as sink:
@@ -169,8 +167,8 @@ def _fit_to_block_size(sequence, block_size):
         return sequence.extend([-1] * [block_size - len(sequence)])
 
 
-def load_and_cache_examples(args, tokenizer):
-    dataset = TextDataset(tokenizer, file_path=args.data_dir)
+def load_and_cache_examples(args, tokenizer_src, tokenizer_tgt):
+    dataset = TextDataset(tokenizer_src, tokenizer_tgt, file_path=args.data_dir)
     return dataset
 
 
@@ -205,14 +203,35 @@ def main():
 
     # Optional parameters
     parser.add_argument(
-        "--model_name_or_path",
+        "--decoder_name_or_path",
         default="bert-base-cased",
         type=str,
-        help="The model checkpoint for weights initialization.",
+        help="The model checkpoint to initialize the decoder's weights with.",
+    )
+    parser.add_argument(
+        "--decoder_type",
+        default="bert",
+        type=str,
+        help="The decoder architecture to be fine-tuned.",
+    )
+    parser.add_argument(
+        "--encoder_name_or_path",
+        default="bert-base-cased",
+        type=str,
+        help="The model checkpoint to initialize the encoder's weights with.",
+    )
+    parser.add_argument(
+        "--encoder_type",
+        default="bert",
+        type=str,
+        help="The encoder architecture to be fine-tuned.",
     )
     parser.add_argument("--seed", default=42, type=int)
     args = parser.parse_args()
 
+    if args.encoder_type != 'bert' or args.decoder_type != 'bert':
+        raise ValueError("Only the BERT architecture is currently supported for seq2seq.")
+
     # Set up training device
     # device = torch.device("cpu")
 
@@ -220,16 +239,15 @@ def main():
     set_seed(args)
 
     # Load pretrained model and tokenizer
-    tokenizer_class = BertTokenizer
-    # config = config_class.from_pretrained(args.model_name_or_path)
-    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
-    # model = model_class.from_pretrained(args.model_name_or_path, config=config)
+    encoder_tokenizer_class = AutoTokenizer.from_pretrained(args.encoder_name_or_path)
+    decoder_tokenizer_class = AutoTokenizer.from_pretrained(args.decoder_name_or_path)
+    model = Model2Model.from_pretrained(args.encoder_name_or_path, args.decoder_name_or_path)
     # model.to(device)
 
     logger.info("Training/evaluation parameters %s", args)
 
     # Training
-    _ = load_and_cache_examples(args, tokenizer)
+    source, target = load_and_cache_examples(args, tokenizer)
     # global_step, tr_loss = train(args, train_dataset, model, tokenizer)
     # logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 

From 578d23e06114bbd63cf5e931e0fdef9b8b6ac8c4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 14:02:27 +0200
Subject: [PATCH 084/144] add training pipeline (formatting temporary)

---
 examples/run_seq2seq_finetuning.py | 139 +++++++++++++++++++++++++++--
 1 file changed, 130 insertions(+), 9 deletions(-)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index 3e3cc34cb8..ad6d126165 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -23,8 +23,9 @@ import random
 import os
 
 import numpy as np
+from tqdm import tqdm, trange
 import torch
-from torch.utils.data import Dataset
+from torch.utils.data import Dataset, RandomSampler
 
 from transformers import AutoTokenizer, Model2Model
 
@@ -90,10 +91,14 @@ class TextDataset(Dataset):
                     except IndexError:  # skip ill-formed stories
                         continue
 
-                story = tokenizer_src.convert_tokens_to_ids(tokenizer_src.tokenize(story))
+                story = tokenizer_src.convert_tokens_to_ids(
+                    tokenizer_src.tokenize(story)
+                )
                 story_seq = _fit_to_block_size(story, block_size)
 
-                summary = tokenizer_tgt.convert_tokens_to_ids(tokenizer_tgt.tokenize(summary))
+                summary = tokenizer_tgt.convert_tokens_to_ids(
+                    tokenizer_tgt.tokenize(summary)
+                )
                 summary_seq = _fit_to_block_size(summary, block_size)
 
                 self.examples.append((story_seq, summary_seq))
@@ -179,7 +184,89 @@ def load_and_cache_examples(args, tokenizer_src, tokenizer_tgt):
 
 def train(args, train_dataset, model, tokenizer):
     """ Fine-tune the pretrained model on the corpus. """
-    raise NotImplementedError
+
+    # Prepare the data loading
+    args.train_bach_size = 1
+    train_sampler = RandomSampler(train_dataset)
+    train_dataloader = DataLoader(
+        train_dataset, sampler=train_sampler, batch_size=args.train_bach_size
+    )
+
+    # Prepare the optimizer and schedule (linear warmup and decay)
+    no_decay = ["bias", "LayerNorm.weight"]
+    optimizer_grouped_parameters = [
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if not any(nd in n for nd in no_decay)
+            ],
+            "weight_decay": args.weight_decay,
+        },
+        {
+            "params": [
+                p
+                for n, p in model.named_parameters()
+                if any(nd in n for nd in no_decay)
+            ],
+            "weight_decay": 0.0,
+        },
+    ]
+    optimizer = AdamW(
+        optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon
+    )
+    scheduler = WarmupLinearSchedule(
+        optimizer, warmup_steps=args.warmup_steps, t_total=t_total
+    )
+
+    # Train
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info(
+        "  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size
+    )
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size
+        * args.gradient_accumulation_steps
+        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(args.num_train_epochs, desc="Epoch", disable=True)
+    set_seed(args)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)
+        for step, batch in enumerate(epoch_iterator):
+            source = ([s for s, _ in batch]).to(args.device)
+            target = ([t for _, t in batch]).to(args.device)
+            model.train()
+            outputs = model(source, target)
+            loss = outputs[0]
+            loss.backward()
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                optimizer.step()
+                scheduler.step()
+                model.zero_grad()
+                global_step += 1
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    return global_step, tr_loss / global_step
 
 
 def main():
@@ -202,6 +289,9 @@ def main():
     )
 
     # Optional parameters
+    parser.add_argument(
+        "--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer."
+    )
     parser.add_argument(
         "--decoder_name_or_path",
         default="bert-base-cased",
@@ -226,11 +316,40 @@ def main():
         type=str,
         help="The encoder architecture to be fine-tuned.",
     )
+    parser.add_argument(
+        "--learning_rate",
+        default=5e-5,
+        type=float,
+        help="The initial learning rate for Adam.",
+    )
+    parser.add_argument(
+        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument(
+        "--num_train_epochs",
+        default=1,
+        type=int,
+        help="Total number of training epochs to perform.",
+    )
     parser.add_argument("--seed", default=42, type=int)
+    parser.add_argument(
+        "--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps."
+    )
+    parser.add_argument(
+        "--weight_decay", default=0.0, type=float, help="Weight deay if we apply some."
+    )
     args = parser.parse_args()
 
-    if args.encoder_type != 'bert' or args.decoder_type != 'bert':
-        raise ValueError("Only the BERT architecture is currently supported for seq2seq.")
+    if args.encoder_type != "bert" or args.decoder_type != "bert":
+        raise ValueError(
+            "Only the BERT architecture is currently supported for seq2seq."
+        )
 
     # Set up training device
     # device = torch.device("cpu")
@@ -241,14 +360,16 @@ def main():
     # Load pretrained model and tokenizer
     encoder_tokenizer_class = AutoTokenizer.from_pretrained(args.encoder_name_or_path)
     decoder_tokenizer_class = AutoTokenizer.from_pretrained(args.decoder_name_or_path)
-    model = Model2Model.from_pretrained(args.encoder_name_or_path, args.decoder_name_or_path)
+    model = Model2Model.from_pretrained(
+        args.encoder_name_or_path, args.decoder_name_or_path
+    )
     # model.to(device)
 
     logger.info("Training/evaluation parameters %s", args)
 
     # Training
-    source, target = load_and_cache_examples(args, tokenizer)
-    # global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+    train_dataset = load_and_cache_examples(args, tokenizer)
+    global_step, tr_loss = train(args, train_dataset, model, tokenizer)
     # logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
 
 

From 8cd56e30363fc00d947992ae412551f1775a5cfa Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 17 Oct 2019 16:33:26 +0200
Subject: [PATCH 085/144] fix data processing in script

---
 examples/run_seq2seq_finetuning.py | 49 +++++++++---------------------
 1 file changed, 15 insertions(+), 34 deletions(-)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index ad6d126165..38dcb2d005 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -58,12 +58,12 @@ class TextDataset(Dataset):
     [2] https://github.com/abisee/cnn-dailymail/
     """
 
-    def __init_(self, tokenizer_src, tokenizer_tgt, data_dir="", block_size=512):
+    def __init__(self, tokenizer, prefix='train', data_dir="", block_size=512):
         assert os.path.isdir(data_dir)
 
         # Load features that have already been computed if present
         cached_features_file = os.path.join(
-            data_dir, "cached_lm_{}_{}".format(block_size, data_dir)
+            data_dir, "cached_lm_{}_{}".format(block_size, prefix)
         )
         if os.path.exists(cached_features_file):
             logger.info("Loading features from cached file %s", cached_features_file)
@@ -72,7 +72,7 @@ class TextDataset(Dataset):
                 return
 
         logger.info("Creating features from dataset at %s", data_dir)
-
+        self.examples = []
         datasets = ["cnn", "dailymail"]
         for dataset in datasets:
             path_to_stories = os.path.join(data_dir, dataset, "stories")
@@ -91,21 +91,17 @@ class TextDataset(Dataset):
                     except IndexError:  # skip ill-formed stories
                         continue
 
-                story = tokenizer_src.convert_tokens_to_ids(
-                    tokenizer_src.tokenize(story)
-                )
+                story = tokenizer.encode(story)
                 story_seq = _fit_to_block_size(story, block_size)
 
-                summary = tokenizer_tgt.convert_tokens_to_ids(
-                    tokenizer_tgt.tokenize(summary)
-                )
+                summary = tokenizer.encode(summary)
                 summary_seq = _fit_to_block_size(summary, block_size)
 
                 self.examples.append((story_seq, summary_seq))
 
         logger.info("Saving features into cache file %s", cached_features_file)
         with open(cached_features_file, "wb") as sink:
-            pickle.dump(self.examples, sink, protocole=pickle.HIGHEST_PROTOCOL)
+            pickle.dump(self.examples, sink, protocol=pickle.HIGHEST_PROTOCOL)
 
     def __len__(self):
         return len(self.examples)
@@ -169,11 +165,11 @@ def _fit_to_block_size(sequence, block_size):
     if len(sequence) > block_size:
         return sequence[:block_size]
     else:
-        return sequence.extend([-1] * [block_size - len(sequence)])
+        return sequence.extend([-1] * (block_size - len(sequence)))
 
 
-def load_and_cache_examples(args, tokenizer_src, tokenizer_tgt):
-    dataset = TextDataset(tokenizer_src, tokenizer_tgt, file_path=args.data_dir)
+def load_and_cache_examples(args, tokenizer):
+    dataset = TextDataset(tokenizer, data_dir=args.data_dir)
     return dataset
 
 
@@ -293,29 +289,17 @@ def main():
         "--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer."
     )
     parser.add_argument(
-        "--decoder_name_or_path",
+        "--model_name_or_path",
         default="bert-base-cased",
         type=str,
-        help="The model checkpoint to initialize the decoder's weights with.",
+        help="The model checkpoint to initialize the encoder and decoder's weights with.",
     )
     parser.add_argument(
-        "--decoder_type",
+        "--model_type",
         default="bert",
         type=str,
         help="The decoder architecture to be fine-tuned.",
     )
-    parser.add_argument(
-        "--encoder_name_or_path",
-        default="bert-base-cased",
-        type=str,
-        help="The model checkpoint to initialize the encoder's weights with.",
-    )
-    parser.add_argument(
-        "--encoder_type",
-        default="bert",
-        type=str,
-        help="The encoder architecture to be fine-tuned.",
-    )
     parser.add_argument(
         "--learning_rate",
         default=5e-5,
@@ -346,7 +330,7 @@ def main():
     )
     args = parser.parse_args()
 
-    if args.encoder_type != "bert" or args.decoder_type != "bert":
+    if args.model_type != "bert":
         raise ValueError(
             "Only the BERT architecture is currently supported for seq2seq."
         )
@@ -358,11 +342,8 @@ def main():
     set_seed(args)
 
     # Load pretrained model and tokenizer
-    encoder_tokenizer_class = AutoTokenizer.from_pretrained(args.encoder_name_or_path)
-    decoder_tokenizer_class = AutoTokenizer.from_pretrained(args.decoder_name_or_path)
-    model = Model2Model.from_pretrained(
-        args.encoder_name_or_path, args.decoder_name_or_path
-    )
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+    model = Model2Model.from_pretrained(args.model_name_or_path)
     # model.to(device)
 
     logger.info("Training/evaluation parameters %s", args)

From 56e2ee4eadc482a31ca46c97c3cc236824869510 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 17 Oct 2019 16:33:31 +0200
Subject: [PATCH 086/144] fix model2model

---
 transformers/modeling_seq2seq.py | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index cc5cc53bc3..ca3b9dc87a 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -28,7 +28,7 @@ from .modeling_utils import PreTrainedModel, SequenceSummary
 logger = logging.getLogger(__name__)
 
 
-class PreTrainedSeq2seq(PreTrainedModel):
+class PreTrainedSeq2seq(nn.Module):
     r"""
         :class:`~transformers.Seq2seq` is a generic model class that will be
         instantiated as a Seq2seq model with one of the base model classes of
@@ -43,7 +43,7 @@ class PreTrainedSeq2seq(PreTrainedModel):
         self.decoder = decoder
 
     @classmethod
-    def from_pretrained(cls, encoder_pretrained_model_name_or_path, decoder_pretrained_model_name_or_path, *model_args, **kwargs):
+    def from_pretrained(cls, encoder_pretrained_model_name_or_path=None, decoder_pretrained_model_name_or_path=None, *model_args, **kwargs):
         r""" Instantiates an encoder and a decoder from one or two base classes
         of the library from pre-trained model checkpoints.
 
@@ -177,8 +177,8 @@ class PreTrainedSeq2seq(PreTrainedModel):
 
 
 class Model2Model(PreTrainedSeq2seq):
-    def __init__(self):
-        super(Model2Model, self).__init__()
+    def __init__(self, *args, **kwargs):
+        super(Model2Model, self).__init__(*args, **kwargs)
         self.tie_weights()
 
     def tie_weights(self):
@@ -197,7 +197,14 @@ class Model2Model(PreTrainedSeq2seq):
         by a model-specific keyword (bert, )...
         """
         # self._tie_or_clone_weights(self.encoder, self.decoder)
-        raise NotImplementedError
+        pass
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
+        model = super(Model2Model, cls).from_pretrained(encoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
+                                                        decoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
+                                                        **kwargs)
+        return model
 
 
 class Model2LSTM(PreTrainedSeq2seq):

From ecd15667f36ddac60bb3d26c56b6d835e1d007ec Mon Sep 17 00:00:00 2001
From: leo-du <dul2@cs.washington.edu>
Date: Thu, 17 Oct 2019 11:04:34 -0700
Subject: [PATCH 087/144] fix repetition penalty

---
 examples/run_generation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_generation.py b/examples/run_generation.py
index 13685c946c..ef58cfd844 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -139,7 +139,7 @@ def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=
             next_token_logits = outputs[0][0, -1, :] / (temperature if temperature > 0 else 1.)
 
             # reptition penalty from CTRL (https://arxiv.org/abs/1909.05858)
-            for _ in set(generated):
+            for _ in set(generated.view(-1).tolist()):
                 next_token_logits[_] /= repetition_penalty
                 
             filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)

From fd97761c5a977fd22df789d2851cf57c7c9c0930 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Thu, 17 Oct 2019 15:28:58 -0400
Subject: [PATCH 088/144] soft launch distilroberta

---
 transformers/configuration_roberta.py | 1 +
 transformers/modeling_roberta.py      | 1 +
 transformers/modeling_tf_roberta.py   | 1 +
 transformers/tokenization_roberta.py  | 3 +++
 4 files changed, 6 insertions(+)

diff --git a/transformers/configuration_roberta.py b/transformers/configuration_roberta.py
index b92d6a908b..367a85211d 100644
--- a/transformers/configuration_roberta.py
+++ b/transformers/configuration_roberta.py
@@ -28,6 +28,7 @@ ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
+    'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-config.json",
 }
 
 
diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py
index 4ea0800e39..eb340dc7fb 100644
--- a/transformers/modeling_roberta.py
+++ b/transformers/modeling_roberta.py
@@ -34,6 +34,7 @@ ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
+    'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-pytorch_model.bin",
 }
 
 class RobertaEmbeddings(BertEmbeddings):
diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py
index 83c1db0495..244c83f2b3 100644
--- a/transformers/modeling_tf_roberta.py
+++ b/transformers/modeling_tf_roberta.py
@@ -35,6 +35,7 @@ TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-tf_model.h5",
     'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-tf_model.h5",
     'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-tf_model.h5",
+    'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-tf_model.h5",
 }
 
 class TFRobertaEmbeddings(TFBertEmbeddings):
diff --git a/transformers/tokenization_roberta.py b/transformers/tokenization_roberta.py
index 9cc8a9af6e..5e1300fa4d 100644
--- a/transformers/tokenization_roberta.py
+++ b/transformers/tokenization_roberta.py
@@ -46,12 +46,14 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
         'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
         'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
+        'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-vocab.json",
     },
     'merges_file':
     {
         'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
         'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
         'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
+        'distilroberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/distilroberta-base-merges.txt",
     },
 }
 
@@ -59,6 +61,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'roberta-base': 512,
     'roberta-large': 512,
     'roberta-large-mnli': 512,
+    'distilroberta-base': 512,
 }
 
 

From 8efc0ec91a398f051dfdbff9c64a042d4836ae2b Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Fri, 18 Oct 2019 10:45:44 -0400
Subject: [PATCH 089/144] Add Benchmarks to issue templates

---
 .github/ISSUE_TEMPLATE/---new-benchmark.md | 22 ++++++++++++++++++++++
 1 file changed, 22 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/---new-benchmark.md

diff --git a/.github/ISSUE_TEMPLATE/---new-benchmark.md b/.github/ISSUE_TEMPLATE/---new-benchmark.md
new file mode 100644
index 0000000000..4526bd076a
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/---new-benchmark.md
@@ -0,0 +1,22 @@
+---
+name: "\U0001F5A5 New Benchmark"
+about: You benchmarked a part of this library and would like to share your results
+title: "[Benchmark]"
+labels: ''
+assignees: ''
+
+---
+
+# Benchmarking Transformers
+
+## Benchmark
+
+Which part of Transformers did you benchmark?
+
+## Set-up
+
+What did you run your benchmarks on? Please include details, such as: CPU, GPU? If using multiple GPUs, which parallelization did you use?
+
+## Results
+
+Put your results here!

From 7dd29ed2f192946a297fc81a957ae68d2a4c2944 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 18 Oct 2019 10:53:04 -0400
Subject: [PATCH 090/144] Benchmarks example script

---
 examples/benchmarks.py | 460 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 460 insertions(+)
 create mode 100644 examples/benchmarks.py

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
new file mode 100644
index 0000000000..b1153bf566
--- /dev/null
+++ b/examples/benchmarks.py
@@ -0,0 +1,460 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Benchmarking the library on inference and training """
+import tensorflow as tf
+
+# If checking the tensors placement
+# tf.debugging.set_log_device_placement(True)
+
+from typing import List
+import timeit
+from transformers import is_tf_available, is_torch_available
+from time import time
+import torch
+
+import argparse
+import csv
+
+if not is_torch_available() or not is_tf_available():
+    raise ImportError("TensorFlow and Pytorch should be installed on the system.")
+
+from transformers import AutoConfig, AutoModel, AutoTokenizer, TFAutoModel
+
+input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as 
+the Director of Hatcheries and Conditioning entered the room, in the 
+
+
+
+scarcely breathing silence, the absent-minded, soliloquizing hum or 
+whistle, of absorbed concentration. A troop of newly arrived students, 
+very young, pink and callow, followed nervously, rather abjectly, at the 
+Director's heels. Each of them carried a notebook, in which, whenever 
+the great man spoke, he desperately scribbled. Straight from the 
+horse's mouth. It was a rare privilege. The D. H. C. for Central London 
+always made a point of personally conducting his new students round 
+the various departments. 
+
+"Just to give you a general idea," he would explain to them. For of 
+course some sort of general idea they must have, if they were to do 
+their work intelligently-though as little of one, if they were to be good 
+and happy members of society, as possible. For particulars, as every 
+one knows, make for virtue and happiness; generalities are intellectu- 
+ally necessary evils. Not philosophers but fret-sawyers and stamp col- 
+lectors compose the backbone of society. 
+
+"To-morrow," he would add, smiling at them with a slightly menacing 
+geniality, "you'll be settling down to serious work. You won't have time 
+for generalities. Meanwhile ..." 
+
+Meanwhile, it was a privilege. Straight from the horse's mouth into the 
+notebook. The boys scribbled like mad. 
+
+Tall and rather thin but upright, the Director advanced into the room. 
+He had a long chin and big rather prominent teeth, just covered, when 
+he was not talking, by his full, floridly curved lips. Old, young? Thirty? 
+Fifty? Fifty-five? It was hard to say. And anyhow the question didn't 
+arise; in this year of stability, A. F. 632, it didn't occur to you to ask it. 
+
+"I shall begin at the beginning," said the D.H.C. and the more zealous 
+students recorded his intention in their notebooks: Begin at the begin- 
+ning. "These," he waved his hand, "are the incubators." And opening 
+an insulated door he showed them racks upon racks of numbered test- 
+tubes. "The week's supply of ova. Kept," he explained, "at blood heat; 
+whereas the male gametes," and here he opened another door, "they 
+have to be kept at thirty-five instead of thirty-seven. Full blood heat 
+sterilizes." Rams wrapped in theremogene beget no lambs. 
+
+Still leaning against the incubators he gave them, while the pencils 
+scurried illegibly across the pages, a brief description of the modern 
+
+
+
+fertilizing process; spoke first, of course, of its surgical introduc- 
+tion-"the operation undergone voluntarily for the good of Society, not 
+to mention the fact that it carries a bonus amounting to six months' 
+salary"; continued with some account of the technique for preserving 
+the excised ovary alive and actively developing; passed on to a consid- 
+eration of optimum temperature, salinity, viscosity; referred to the liq- 
+uor in which the detached and ripened eggs were kept; and, leading 
+his charges to the work tables, actually showed them how this liquor 
+was drawn off from the test-tubes; how it was let out drop by drop 
+onto the specially warmed slides of the microscopes; how the eggs 
+which it contained were inspected for abnormalities, counted and 
+transferred to a porous receptacle; how (and he now took them to 
+watch the operation) this receptacle was immersed in a warm bouillon 
+containing free-swimming spermatozoa-at a minimum concentration 
+of one hundred thousand per cubic centimetre, he insisted; and how, 
+after ten minutes, the container was lifted out of the liquor and its 
+contents re-examined; how, if any of the eggs remained unfertilized, it 
+was again immersed, and, if necessary, yet again; how the fertilized 
+ova went back to the incubators; where the Alphas and Betas re- 
+mained until definitely bottled; while the Gammas, Deltas and Epsilons 
+were brought out again, after only thirty-six hours, to undergo Bo- 
+kanovsky's Process. 
+
+"Bokanovsky's Process," repeated the Director, and the students un- 
+derlined the words in their little notebooks. 
+
+One egg, one embryo, one adult-normality. But a bokanovskified egg 
+will bud, will proliferate, will divide. From eight to ninety-six buds, and 
+every bud will grow into a perfectly formed embryo, and every embryo 
+into a full-sized adult. Making ninety-six human beings grow where 
+only one grew before. Progress. 
+
+"Essentially," the D.H.C. concluded, "bokanovskification consists of a 
+series of arrests of development. We check the normal growth and, 
+paradoxically enough, the egg responds by budding." 
+
+Responds by budding. The pencils were busy. 
+
+He pointed. On a very slowly moving band a rack-full of test-tubes was 
+entering a large metal box, another, rack-full was emerging. Machinery 
+faintly purred. It took eight minutes for the tubes to go through, he 
+
+
+
+told them. Eight minutes of hard X-rays being about as much as an 
+egg can stand. A few died; of the rest, the least susceptible divided 
+into two; most put out four buds; some eight; all were returned to the 
+incubators, where the buds began to develop; then, after two days, 
+were suddenly chilled, chilled and checked. Two, four, eight, the buds 
+in their turn budded; and having budded were dosed almost to death 
+with alcohol; consequently burgeoned again and having budded-bud 
+out of bud out of bud-were thereafter-further arrest being generally 
+fatal-left to develop in peace. By which time the original egg was in a 
+fair way to becoming anything from eight to ninety-six embryos- a 
+prodigious improvement, you will agree, on nature. Identical twins-but 
+not in piddling twos and threes as in the old viviparous days, when an 
+egg would sometimes accidentally divide; actually by dozens, by 
+scores at a time. 
+
+"Scores," the Director repeated and flung out his arms, as though he 
+were distributing largesse. "Scores." 
+
+But one of the students was fool enough to ask where the advantage 
+lay. 
+
+"My good boy!" The Director wheeled sharply round on him. "Can't you 
+see? Can't you see?" He raised a hand; his expression was solemn. 
+"Bokanovsky's Process is one of the major instruments of social stabil- 
+ity!" 
+
+Major instruments of social stability. 
+
+Standard men and women; in uniform batches. The whole of a small 
+factory staffed with the products of a single bokanovskified egg. 
+
+"Ninety-six identical twins working ninety-six identical machines!" The 
+voice was almost tremulous with enthusiasm. "You really know where 
+you are. For the first time in history." He quoted the planetary motto. 
+"Community, Identity, Stability." Grand words. "If we could bo- 
+kanovskify indefinitely the whole problem would be solved." 
+
+Solved by standard Gammas, unvarying Deltas, uniform Epsilons. Mil- 
+lions of identical twins. The principle of mass production at last applied 
+to biology. 
+
+
+
+"But, alas," the Director shook his head, "we can't bokanovskify indefi- 
+nitely." 
+
+Ninety-six seemed to be the limit; seventy-two a good average. From 
+the same ovary and with gametes of the same male to manufacture as 
+many batches of identical twins as possible-that was the best (sadly a 
+second best) that they could do. And even that was difficult. 
+
+"For in nature it takes thirty years for two hundred eggs to reach ma- 
+turity. But our business is to stabilize the population at this moment, 
+here and now. Dribbling out twins over a quarter of a century-what 
+would be the use of that?" 
+
+Obviously, no use at all. But Podsnap's Technique had immensely ac- 
+celerated the process of ripening. They could make sure of at least a 
+hundred and fifty mature eggs within two years. Fertilize and bo- 
+kanovskify-in other words, multiply by seventy-two-and you get an 
+average of nearly eleven thousand brothers and sisters in a hundred 
+and fifty batches of identical twins, all within two years of the same 
+age. 
+
+"And in exceptional cases we can make one ovary yield us over fifteen 
+thousand adult individuals." 
+
+Beckoning to a fair-haired, ruddy young man who happened to be 
+passing at the moment. "Mr. Foster," he called. The ruddy young man 
+approached. "Can you tell us the record for a single ovary, Mr. Foster?" 
+
+"Sixteen thousand and twelve in this Centre," Mr. Foster replied with- 
+out hesitation. He spoke very quickly, had a vivacious blue eye, and 
+took an evident pleasure in quoting figures. "Sixteen thousand and 
+twelve; in one hundred and eighty-nine batches of identicals. But of 
+course they've done much better," he rattled on, "in some of the tropi- 
+cal Centres. Singapore has often produced over sixteen thousand five 
+hundred; and Mombasa has actually touched the seventeen thousand 
+mark. But then they have unfair advantages. You should see the way a 
+negro ovary responds to pituitary! It's quite astonishing, when you're 
+used to working with European material. Still," he added, with a laugh 
+(but the light of combat was in his eyes and the lift of his chin was 
+challenging), "still, we mean to beat them if we can. I'm working on a 
+wonderful Delta-Minus ovary at this moment. Only just eighteen 
+
+
+
+months old. Over twelve thousand seven hundred children already, ei- 
+ther decanted or in embryo. And still going strong. We'll beat them 
+yet." 
+
+"That's the spirit I like!" cried the Director, and clapped Mr. Foster on 
+the shoulder. "Come along with us, and give these boys the benefit of 
+your expert knowledge." 
+
+Mr. Foster smiled modestly. "With pleasure." They went. 
+In the Bottling Room all was harmonious bustle and ordered activity. 
+Flaps of fresh sow's peritoneum ready cut to the proper size came 
+shooting up in little lifts from the Organ Store in the sub-basement. 
+Whizz and then, click! the lift-hatches hew open; the bottle-liner had 
+only to reach out a hand, take the flap, insert, smooth-down, and be- 
+fore the lined bottle had had time to travel out of reach along the end- 
+less band, whizz, click! another flap of peritoneum had shot up from 
+the depths, ready to be slipped into yet another bottle, the next of that 
+slow interminable procession on the band. 
+
+Next to the Liners stood the Matriculators. The procession advanced; 
+one by one the eggs were transferred from their test-tubes to the 
+larger containers; deftly the peritoneal lining was slit, the morula 
+dropped into place, the saline solution poured in ... and already the 
+bottle had passed, and it was the turn of the labellers. Heredity, date 
+of fertilization, membership of Bokanovsky Group-details were trans- 
+ferred from test-tube to bottle. No longer anonymous, but named, 
+identified, the procession marched slowly on; on through an opening in 
+the wall, slowly on into the Social Predestination Room. 
+"Eighty-eight cubic metres of card-index," said Mr. Foster with relish, 
+as they entered."""
+
+
+def create_setup_and_compute(model_names: List[str],
+                             gpu: bool = True,
+                             tensorflow: bool = False,
+                             average_over: int = 3,
+                             torchscript: bool = False,
+                             xla: bool = False,
+                             save_to_csv: bool = False,
+                             csv_filename: str = f"results_{round(time())}.csv"):
+    """Benchmark every checkpoint in `model_names`, print the timings and optionally write them to a CSV file."""
+    if xla:
+        tf.config.optimizer.set_jit(True)
+    # Both helpers fill and return `dictionary`: {model: {"bs": [...], "ss": [...], "results": {bs: {ss: value}}}}.
+    if tensorflow:
+        dictionary = {model_name: {} for model_name in model_names}
+        results = _compute_tensorflow(model_names, dictionary, average_over)
+    else:
+        device = 'cuda' if (gpu and torch.cuda.is_available()) else 'cpu'
+        dictionary = {model_name: {} for model_name in model_names}
+        results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript)
+
+    print("=========== RESULTS ===========")
+    for model_name in model_names:
+        print("\t" + f"======= MODEL CHECKPOINT: {model_name} =======")
+        for batch_size in results[model_name]["bs"]:
+            print("\t\t" + f"===== BATCH SIZE: {batch_size} =====")
+            for slice_size in results[model_name]["ss"]:
+                result = results[model_name]['results'][batch_size][slice_size]
+                if isinstance(result, str):
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: "
+                          f"{result}")
+                else:
+                    print(f"\t\t{model_name}/{batch_size}/{slice_size}: "
+                          f"{(round(1000 * result) / 1000)}s")
+    # main() forwards csv_filename=None when --csv_filename is not given, so fall back to a timestamped name here.
+    if save_to_csv:
+        with open(csv_filename or f"results_{round(time())}.csv", mode='w', newline='') as csv_file:
+            fieldnames = ['model',
+                          '1x8', '1x64', '1x128', '1x256', '1x512', '1x1024',
+                          '2x8', '2x64', '2x128', '2x256', '2x512', '2x1024',
+                          '4x8', '4x64', '4x128', '4x256', '4x512', '4x1024',
+                          '8x8', '8x64', '8x128', '8x256', '8x512', '8x1024',
+                          ]
+
+            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
+            writer.writeheader()
+
+            for model_name in model_names:
+                model_results = {
+                    f'{bs}x{ss}': results[model_name]['results'][bs][ss]
+                    for bs in results[model_name]["results"]
+                    for ss in results[model_name]['results'][bs]
+                }
+                writer.writerow({'model': model_name, **model_results})
+
+
+def _compute_pytorch(model_names, dictionary, average_over, device, torchscript):
+    """Time PyTorch forward passes per checkpoint over fixed batch sizes and sequence lengths; fills and returns `dictionary`."""
+    for c, model_name in enumerate(model_names):
+        print(f"{c + 1} / {len(model_names)}")
+        config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
+        model = AutoModel.from_pretrained(model_name, config=config)
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+        tokenized_sequence = tokenizer.encode(input_text)
+        # .get() gives None for names missing from the tokenizer's size table (instead of a KeyError): no length cap then.
+        max_input_size = tokenizer.max_model_input_sizes.get(model_name)
+        batch_sizes = [1, 2, 4, 8]
+        slice_sizes = [8, 64, 128, 256, 512, 1024]
+
+        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {i: {} for i in batch_sizes}}
+
+        for batch_size in batch_sizes:
+            model.to(device)
+            model.eval()
+            for slice_size in slice_sizes:
+                if max_input_size is not None and slice_size > max_input_size:
+                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
+                else:
+                    sequence = torch.tensor(tokenized_sequence[:slice_size], device=device).repeat(batch_size, 1)
+                    try:
+                        if torchscript:
+                            print("Tracing model with sequence size", sequence.shape)
+                            inference = torch.jit.trace(model, sequence)
+                            inference(sequence)
+                        else:
+                            inference = model
+                            inference(sequence)
+                        # The call above is an untimed warm-up; each timeit repeat below runs 3 passes, hence the / 3.0.
+                        print("Going through model with sequence of shape", sequence.shape)
+                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
+                        average_time = sum(runtimes)/float(len(runtimes)) / 3.0
+                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
+                    except RuntimeError as e:
+                        print("Doesn't fit on GPU.", e)
+                        torch.cuda.empty_cache()
+                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
+    return dictionary
+
+
+def _compute_tensorflow(model_names, dictionary, average_over):
+    """Time TensorFlow forward passes per checkpoint over fixed batch sizes and sequence lengths; fills and returns `dictionary`."""
+    for c, model_name in enumerate(model_names):
+        print(f"{c + 1} / {len(model_names)}")
+        config = AutoConfig.from_pretrained(model_name)
+        model = TFAutoModel.from_pretrained(model_name, config=config)
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+
+        tokenized_sequence = tokenizer.encode(input_text)
+        # .get() gives None for names missing from the tokenizer's size table (instead of a KeyError): no length cap then.
+        max_input_size = tokenizer.max_model_input_sizes.get(model_name)
+        batch_sizes = [1, 2, 4, 8]
+        slice_sizes = [8, 64, 128, 256, 512, 1024]
+
+        dictionary[model_name] = {"bs": batch_sizes, "ss": slice_sizes, "results": {i: {} for i in batch_sizes}}
+
+        print("Using model", model)
+
+        @tf.function
+        def inference(inputs):
+            return model(inputs)
+
+        for batch_size in batch_sizes:
+            for slice_size in slice_sizes:
+                if max_input_size is not None and slice_size > max_input_size:
+                    dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
+                else:
+                    sequence = tf.stack([tf.squeeze(tf.constant(tokenized_sequence[:slice_size])[None, :])] * batch_size)
+
+                    try:
+                        print("Going through model with sequence of shape", sequence.shape)
+                        # To make sure that the model is traced + that the tensors are on the appropriate device
+                        inference(sequence)
+                        # Each timeit repeat below runs 3 forward passes, hence the division by 3.0.
+                        runtimes = timeit.repeat(lambda: inference(sequence), repeat=average_over, number=3)
+                        average_time = sum(runtimes)/float(len(runtimes)) / 3.0
+                        dictionary[model_name]["results"][batch_size][slice_size] = average_time
+                    except tf.errors.ResourceExhaustedError as e:
+                        print("Doesn't fit on GPU.", e)
+                        torch.cuda.empty_cache()  # NOTE(review): PyTorch call in the TF path; presumably a leftover
+                        dictionary[model_name]["results"][batch_size][slice_size] = "N/A"
+    return dictionary
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    # Command-line options; --torch and/or --tensorflow select which backend(s) get benchmarked below.
+    parser.add_argument("--models", required=False, type=str, default='all', help="Model checkpoints to be provided "
+                                                                                  "to the AutoModel classes. Leave "
+                                                                                  "blank to benchmark the base version "
+                                                                                  "of all available model "
+                                                                                  "architectures.")
+    parser.add_argument("--torch", required=False, action="store_true", help="Benchmark the Pytorch version of the "
+                                                                             "models")
+    parser.add_argument("--torch_cuda", required=False, action="store_true", help="Pytorch only: run on available "
+                                                                                  "cuda devices")
+    parser.add_argument("--torchscript", required=False, action="store_true", help="Pytorch only: trace the models "
+                                                                                   "using torchscript")
+    parser.add_argument("--tensorflow", required=False, action="store_true", help="Benchmark the TensorFlow version "
+                                                                                  "of the models. Will run on GPU if "
+                                                                                  "the correct dependencies are "
+                                                                                  "installed")
+    parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.")
+    parser.add_argument("--keras_predict", required=False, action="store_true", help="Whether to use model.predict "
+                                                                                     "instead of model() to do a "
+                                                                                     "forward pass.")  # NOTE(review): parsed but never read in this function
+    parser.add_argument("--save_to_csv", required=False, action="store_true", help="Save to a CSV file.")
+    parser.add_argument("--csv_filename", required=False, default=None, help="CSV filename used if saving results to csv.")
+    parser.add_argument("--average_over", required=False, default=30, type=int, help="Times an experiment will be run.")
+
+    args = parser.parse_args()
+    if args.models == 'all':
+        args.models = [
+            "gpt2",
+            "bert-base-cased",
+            "xlnet-base-cased",
+            "xlm-mlm-en-2048",
+            "transfo-xl-wt103",
+            "openai-gpt",
+            "distilbert-base-uncased",
+            "distilgpt2",
+            "roberta-base",
+            "ctrl"
+        ]
+    else:
+        args.models = args.models.split()  # whitespace-separated checkpoint names
+
+    print("Running with arguments", args)
+    # NOTE: args.csv_filename is None unless --csv_filename is given, and that value is forwarded as-is below.
+    if args.torch:
+        create_setup_and_compute(
+            model_names=args.models,
+            tensorflow=False,
+            gpu=args.torch_cuda,
+            torchscript=args.torchscript,
+            save_to_csv=args.save_to_csv,
+            csv_filename=args.csv_filename,
+            average_over=args.average_over
+        )
+
+    if args.tensorflow:
+        create_setup_and_compute(
+            model_names=args.models,
+            tensorflow=True,
+            xla=args.xla,
+            save_to_csv=args.save_to_csv,
+            csv_filename=args.csv_filename,
+            average_over=args.average_over
+        )
+
+
+if __name__ == '__main__':
+    main()
+

From 82f6abd98aaa691ca0adfe21e85a17dc6f386497 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 18 Oct 2019 17:27:10 -0400
Subject: [PATCH 091/144] Benchmark section added to the documentation

---
 docs/source/benchmarks.md | 54 +++++++++++++++++++++++++++++++++++++++
 docs/source/index.rst     |  1 +
 2 files changed, 55 insertions(+)
 create mode 100644 docs/source/benchmarks.md

diff --git a/docs/source/benchmarks.md b/docs/source/benchmarks.md
new file mode 100644
index 0000000000..decbac47b7
--- /dev/null
+++ b/docs/source/benchmarks.md
@@ -0,0 +1,54 @@
+# Benchmarks
+
+This section is dedicated to the benchmarks done by the library, by maintainers, contributors and users alike. These
+benchmarks will help keep track of the performance improvements that are brought to our models across versions.
+
+## Benchmarking all models for inference
+
+As of version 2.1 we have benchmarked all models for inference, across many different settings: using PyTorch, with
+and without TorchScript, using TensorFlow, with and without XLA. All of those tests were done across CPUs (except for
+TensorFlow XLA) and GPUs.
+
+The approach is detailed in the [following blogpost](https://medium.com/huggingface/benchmarking-transformers-pytorch-and-tensorflow-e2917fb891c2)
+
+The results are available [here](https://docs.google.com/spreadsheets/d/1sryqufw2D0XlUH4sq3e9Wnxu5EAQkaohzrJbd5HdQ_w/edit?usp=sharing).
+
+## TF2 with mixed precision, XLA, Distribution (@tlkh)
+
+This work was done by [Timothy Liu](https://github.com/tlkh).
+
+There are very positive results to be gained from the various TensorFlow 2.0 features:
+
+- Automatic Mixed Precision (AMP)
+- XLA compiler
+- Distribution strategies (multi-GPU)
+
+The benefits are listed here (tested on CoLA, MRPC, SST-2):
+
+- AMP: Between 1.4x and 1.6x decrease in overall time without change in batch size
+- AMP+XLA: Up to 2.5x decrease in overall time on SST-2 (larger dataset)
+- Distribution: Between 1.4x and 3.4x decrease in overall time on 4xV100
+- Combined: Up to 5.7x decrease in overall training time, or 9.1x training throughput
+
+The model quality (measured by the validation accuracy) fluctuates slightly. Taking an average of 4 training runs 
+on a single GPU gives the following results:
+
+- CoLA: AMP results in slightly lower acc (0.820 vs 0.824)
+- MRPC: AMP results in lower acc (0.823 vs 0.835)
+- SST-2: AMP results in slightly lower acc (0.918 vs 0.922)
+
+However, in a distributed setting with 4xV100 (4x batch size), AMP can yield better results:
+
+- CoLA: AMP results in higher acc (0.828 vs 0.812)
+- MRPC: AMP results in lower acc (0.817 vs 0.827)
+- SST-2: AMP results in slightly lower acc (0.926 vs 0.929)
+
+The benchmark script is available [here](https://github.com/NVAITC/benchmarking/blob/master/tf2/bert_dist.py).
+
+Note: on some tasks (e.g. MRPC), the dataset is too small: the overhead due to model compilation with XLA, as well
+as the distribution strategy setup, means training is not sped up. The XLA compile time is also the reason why, although
+throughput can increase a lot (e.g. 2.7x for single GPU), the overall (end-to-end) training speed-up is smaller (as low as 1.4x).
+
+The benefits, as seen on SST-2 (larger dataset), are much clearer.
+
+All results can be seen on this [Google Sheet](https://docs.google.com/spreadsheets/d/1538MN224EzjbRL239sqSiUy6YY-rAjHyXhTzz_Zptls/edit#gid=960868445).
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 7e2c8063fc..4cd1f48ba8 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -63,6 +63,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
     bertology
     torchscript
     multilingual
+    benchmarks
 
 .. toctree::
     :maxdepth: 2

From a2c8c8ef00f6cecfcdc50adc5ee52515c81a5cee Mon Sep 17 00:00:00 2001
From: Ralph Tang <r33tang@uwaterloo.ca>
Date: Sat, 19 Oct 2019 16:19:20 -0400
Subject: [PATCH 092/144] Fix hanging when loading pretrained models

- Fix hanging when loading pretrained models from the cache without having internet access. This is a widespread issue on supercomputers whose internal compute nodes are firewalled.
---
 transformers/file_utils.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/transformers/file_utils.py b/transformers/file_utils.py
index 11c4ba6318..27875212ff 100644
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -246,7 +246,7 @@ def http_get(url, temp_file, proxies=None):
     progress.close()
 
 
-def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
+def get_from_cache(url, cache_dir=None, force_download=False, proxies=None, etag_timeout=10):
     """
     Given a URL, look for the corresponding dataset in the local cache.
     If it's not there, download it. Then return the path to the cached file.
@@ -266,12 +266,12 @@ def get_from_cache(url, cache_dir=None, force_download=False, proxies=None):
         etag = s3_etag(url, proxies=proxies)
     else:
         try:
-            response = requests.head(url, allow_redirects=True, proxies=proxies)
+            response = requests.head(url, allow_redirects=True, proxies=proxies, timeout=etag_timeout)
             if response.status_code != 200:
                 etag = None
             else:
                 etag = response.headers.get("ETag")
-        except EnvironmentError:
+        except (EnvironmentError, requests.exceptions.Timeout):
             etag = None
 
     if sys.version_info[0] == 2 and etag is not None:

From 3775550c4b27e29fac18a545ed87f84c7451aa61 Mon Sep 17 00:00:00 2001
From: Pasquale Minervini <p.minervini@gmail.com>
Date: Sun, 20 Oct 2019 22:33:56 +0100
Subject: [PATCH 093/144] gradient norm clipping should be done right before
 calling the optimiser

---
 examples/run_squad.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 71c656a13d..aaf4952198 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -157,13 +157,16 @@ def train(args, train_dataset, model, tokenizer):
             if args.fp16:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
-                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
             else:
                 loss.backward()
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
                 optimizer.step()
                 scheduler.step()  # Update learning rate schedule
                 model.zero_grad()

From 4d456542e9d381090f9a00b2bcc5a4cb07f6f3f7 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 21 Oct 2019 16:34:14 +0200
Subject: [PATCH 094/144] Fix citation

---
 README.md | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index da0de4ae81..ad771f2ab1 100644
--- a/README.md
+++ b/README.md
@@ -549,12 +549,11 @@ for batch in train_data:
 
 We now have a paper you can cite for the 🤗 Transformers library:
 ```
-@misc{wolf2019transformers,
-    title={Transformers: State-of-the-art Natural Language Processing},
-    author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and Rémi Louf and Morgan Funtowicz and Jamie Brew},
-    year={2019},
-    eprint={1910.03771},
-    archivePrefix={arXiv},
-    primaryClass={cs.CL}
+@article{Wolf2019HuggingFacesTS,
+  title={HuggingFace's Transformers: State-of-the-art Natural Language Processing},
+  author={Thomas Wolf and Lysandre Debut and Victor Sanh and Julien Chaumond and Clement Delangue and Anthony Moi and Pierric Cistac and Tim Rault and R{\'e}mi Louf and Morgan Funtowicz and Jamie Brew},
+  journal={ArXiv},
+  year={2019},
+  volume={abs/1910.03771}
 }
 ```

From abd7110e21102467448035ffdbf6b208a05ac80b Mon Sep 17 00:00:00 2001
From: Pasquale Minervini <p.minervini@gmail.com>
Date: Mon, 21 Oct 2019 19:56:52 +0100
Subject: [PATCH 095/144] gradient norm clipping should be done right before
 calling the optimiser - fixing run_glue and run_ner as well

---
 examples/run_glue.py | 7 +++++--
 examples/run_ner.py  | 7 +++++--
 2 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 45924c9290..54f6689e4d 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -154,13 +154,16 @@ def train(args, train_dataset, model, tokenizer):
             if args.fp16:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
-                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
             else:
                 loss.backward()
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0 and not args.tpu:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
                 optimizer.step()
                 scheduler.step()  # Update learning rate schedule
                 model.zero_grad()
diff --git a/examples/run_ner.py b/examples/run_ner.py
index fdf2f1924a..00eb039258 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -133,13 +133,16 @@ def train(args, train_dataset, model, tokenizer, labels, pad_token_label_id):
             if args.fp16:
                 with amp.scale_loss(loss, optimizer) as scaled_loss:
                     scaled_loss.backward()
-                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
             else:
                 loss.backward()
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
 
             tr_loss += loss.item()
             if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
                 scheduler.step()  # Update learning rate schedule
                 optimizer.step()
                 model.zero_grad()

From 777faa8ae7d9232b3b5ed1d6c7cb11dca3d744c3 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 22 Oct 2019 11:26:42 -0400
Subject: [PATCH 096/144] Fix #1597

---
 transformers/tokenization_ctrl.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py
index 2406fa256b..c8d67ad043 100644
--- a/transformers/tokenization_ctrl.py
+++ b/transformers/tokenization_ctrl.py
@@ -63,11 +63,7 @@ def get_pairs(word):
 class CTRLTokenizer(PreTrainedTokenizer):
     """
     CTRL BPE tokenizer. Peculiarities:
-        - Byte-level Byte-Pair-Encoding
-        - Requires a space to start the input string => the encoding methods should be called with the
-          ``add_prefix_space`` flag set to ``True``.
-          Otherwise, this tokenizer ``encode`` and ``decode`` method will not conserve
-          the absence of a space at the beginning of a string: `tokenizer.decode(tokenizer.encode("Hello")) = " Hello"`
+        - Byte-Pair-Encoding
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP

From 1cfd9748683db43af2c98da1a19d39f0efc8cc3b Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 22 Oct 2019 13:32:23 -0400
Subject: [PATCH 097/144] Option to benchmark only one of the two libraries

---
 examples/benchmarks.py | 55 ++++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index b1153bf566..d03844697d 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -14,7 +14,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 """ Benchmarking the library on inference and training """
-import tensorflow as tf
 
 # If checking the tensors placement
 # tf.debugging.set_log_device_placement(True)
@@ -23,15 +22,18 @@ from typing import List
 import timeit
 from transformers import is_tf_available, is_torch_available
 from time import time
-import torch
-
 import argparse
 import csv
 
-if not is_torch_available() or not is_tf_available():
-    raise ImportError("TensorFlow and Pytorch should be installed on the system.")
+if is_tf_available():
+    import tensorflow as tf
+    from transformers import TFAutoModel
 
-from transformers import AutoConfig, AutoModel, AutoTokenizer, TFAutoModel
+if is_torch_available():
+    import torch
+    from transformers import AutoModel
+
+from transformers import AutoConfig, AutoTokenizer
 
 input_text = """Bent over their instruments, three hundred Fertilizers were plunged, as 
 the Director of Hatcheries and Conditioning entered the room, in the 
@@ -434,26 +436,31 @@ def main():
     print("Running with arguments", args)
 
     if args.torch:
-        create_setup_and_compute(
-            model_names=args.models,
-            tensorflow=False,
-            gpu=args.torch_cuda,
-            torchscript=args.torchscript,
-            save_to_csv=args.save_to_csv,
-            csv_filename=args.csv_filename,
-            average_over=args.average_over
-        )
+        if is_torch_available():
+            create_setup_and_compute(
+                model_names=args.models,
+                tensorflow=False,
+                gpu=args.torch_cuda,
+                torchscript=args.torchscript,
+                save_to_csv=args.save_to_csv,
+                csv_filename=args.csv_filename,
+                average_over=args.average_over
+            )
+        else:
+            raise ImportError("Trying to run a PyTorch benchmark but PyTorch was not found in the environment.")
 
     if args.tensorflow:
-        create_setup_and_compute(
-            model_names=args.models,
-            tensorflow=True,
-            xla=args.xla,
-            save_to_csv=args.save_to_csv,
-            csv_filename=args.csv_filename,
-            average_over=args.average_over
-        )
-
+        if is_tf_available():
+            create_setup_and_compute(
+                model_names=args.models,
+                tensorflow=True,
+                xla=args.xla,
+                save_to_csv=args.save_to_csv,
+                csv_filename=args.csv_filename,
+                average_over=args.average_over
+            )
+        else:
+            raise ImportError("Trying to run a TensorFlow benchmark but TensorFlow was not found in the environment.")
 
 if __name__ == '__main__':
     main()

From 44286b94d3376f56ee7ef039790d40798d5f9e7d Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 22 Oct 2019 13:46:48 -0400
Subject: [PATCH 098/144] RoBERTa doesn't print a warning when no special
 tokens are passed.

---
 transformers/modeling_roberta.py    | 12 ------------
 transformers/modeling_tf_roberta.py | 16 ----------------
 2 files changed, 28 deletions(-)

diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py
index eb340dc7fb..e15663d017 100644
--- a/transformers/modeling_roberta.py
+++ b/transformers/modeling_roberta.py
@@ -169,18 +169,6 @@ class RobertaModel(BertModel):
         self.embeddings = RobertaEmbeddings(config)
         self.init_weights()
 
-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
-        if input_ids[:, 0].sum().item() != 0:
-            logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
-                           "This model requires special tokens in order to work. "
-                           "Please specify add_special_tokens=True in your tokenize.encode()"
-                           "or tokenizer.convert_tokens_to_ids().")
-        return super(RobertaModel, self).forward(input_ids,
-                                                 attention_mask=attention_mask,
-                                                 token_type_ids=token_type_ids,
-                                                 position_ids=position_ids,
-                                                 head_mask=head_mask)
-
 
 @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
     ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py
index 244c83f2b3..b734f056ab 100644
--- a/transformers/modeling_tf_roberta.py
+++ b/transformers/modeling_tf_roberta.py
@@ -65,22 +65,6 @@ class TFRobertaMainLayer(TFBertMainLayer):
         super(TFRobertaMainLayer, self).__init__(config, **kwargs)
         self.embeddings = TFRobertaEmbeddings(config, name='embeddings')
 
-    def call(self, inputs, **kwargs):
-        # Check that input_ids starts with control token
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-        else:
-            input_ids = inputs
-
-        if tf.not_equal(tf.reduce_sum(input_ids[:, 0]), 0):
-            tf.print("A sequence with no special tokens has been passed to the RoBERTa model. "
-                           "This model requires special tokens in order to work. "
-                           "Please specify add_special_tokens=True in your encoding.")
-
-        return super(TFRobertaMainLayer, self).call(inputs, **kwargs)
-
 
 class TFRobertaPreTrainedModel(TFPreTrainedModel):
     """ An abstract class to handle weights initialization and

From 7d709e55ed54961ce3c84f53f1c14ee4f0c8a2e3 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 22 Oct 2019 14:12:33 -0400
Subject: [PATCH 099/144] Remove

---
 examples/benchmarks.py                        |  4 +--
 .../distillation/scripts/binarized_data.py    |  2 +-
 examples/run_generation.py                    |  2 +-
 transformers/tests/tokenization_bert_test.py  |  4 +--
 .../tests/tokenization_distilbert_test.py     |  4 +--
 .../tests/tokenization_roberta_test.py        |  8 +++---
 .../tests/tokenization_tests_commons.py       | 28 ++++++++++---------
 transformers/tests/tokenization_xlm_test.py   |  4 +--
 transformers/tests/tokenization_xlnet_test.py |  4 +--
 transformers/tokenization_utils.py            | 20 ++++++-------
 10 files changed, 41 insertions(+), 39 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index d03844697d..06f368d946 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -309,7 +309,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript)
         model = AutoModel.from_pretrained(model_name, config=config)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-        tokenized_sequence = tokenizer.encode(input_text)
+        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
 
         max_input_size = tokenizer.max_model_input_sizes[model_name]
         batch_sizes = [1, 2, 4, 8]
@@ -353,7 +353,7 @@ def _compute_tensorflow(model_names, dictionary, average_over):
         model = TFAutoModel.from_pretrained(model_name, config=config)
         tokenizer = AutoTokenizer.from_pretrained(model_name)
 
-        tokenized_sequence = tokenizer.encode(input_text)
+        tokenized_sequence = tokenizer.encode(input_text, add_special_tokens=False)
 
         max_input_size = tokenizer.max_model_input_sizes[model_name]
         batch_sizes = [1, 2, 4, 8]
diff --git a/examples/distillation/scripts/binarized_data.py b/examples/distillation/scripts/binarized_data.py
index 43824e9964..681cc2de34 100644
--- a/examples/distillation/scripts/binarized_data.py
+++ b/examples/distillation/scripts/binarized_data.py
@@ -68,7 +68,7 @@ def main():
     start = time.time()
     for text in data:
         text = f'{bos} {text.strip()} {sep}'
-        token_ids = tokenizer.encode(text)
+        token_ids = tokenizer.encode(text, add_special_tokens=False)
         rslt.append(token_ids)
 
         iter += 1
diff --git a/examples/run_generation.py b/examples/run_generation.py
index ef58cfd844..b7907e40da 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -223,7 +223,7 @@ def main():
         if args.model_type in ["transfo-xl", "xlnet"]:
             # Models with memory likes to have a long prompt for short inputs.
             raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text
-        context_tokens = tokenizer.encode(raw_text)
+        context_tokens = tokenizer.encode(raw_text, add_special_tokens=False)
         out = sample_sequence(
             model=model,
             context=context_tokens,
diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py
index 5e49e2915b..fd61ec30ba 100644
--- a/transformers/tests/tokenization_bert_test.py
+++ b/transformers/tests/tokenization_bert_test.py
@@ -128,8 +128,8 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-uncased")
 
-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
 
         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
diff --git a/transformers/tests/tokenization_distilbert_test.py b/transformers/tests/tokenization_distilbert_test.py
index a18d644fe8..e3c8376ca8 100644
--- a/transformers/tests/tokenization_distilbert_test.py
+++ b/transformers/tests/tokenization_distilbert_test.py
@@ -33,8 +33,8 @@ class DistilBertTokenizationTest(BertTokenizationTest):
     def test_sequence_builders(self):
         tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
 
-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
 
         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
diff --git a/transformers/tests/tokenization_roberta_test.py b/transformers/tests/tokenization_roberta_test.py
index a731ac26c9..b31dd94f21 100644
--- a/transformers/tests/tokenization_roberta_test.py
+++ b/transformers/tests/tokenization_roberta_test.py
@@ -70,19 +70,19 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         tokenizer = self.get_tokenizer()
 
         self.assertListEqual(
-            tokenizer.encode('Hello world!'),
+            tokenizer.encode('Hello world!', add_special_tokens=False),
             [0, 31414, 232, 328, 2]
         )
         self.assertListEqual(
-            tokenizer.encode('Hello world! cécé herlolip 418'),
+            tokenizer.encode('Hello world! cécé herlolip 418', add_special_tokens=False),
             [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
         )
 
     def test_sequence_builders(self):
         tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
 
-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
 
         encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
         encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index b2801d5f41..a921696b77 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -79,13 +79,13 @@ class CommonTestCases:
             # Now let's start the test
             tokenizer = self.get_tokenizer(max_len=42)
 
-            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
+            before_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
 
             with TemporaryDirectory() as tmpdirname:
                 tokenizer.save_pretrained(tmpdirname)
                 tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)
 
-                after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running")
+                after_tokens = tokenizer.encode(u"He is very happy, UNwant\u00E9d,running", add_special_tokens=False)
                 self.assertListEqual(before_tokens, after_tokens)
 
                 self.assertEqual(tokenizer.max_len, 42)
@@ -130,7 +130,7 @@ class CommonTestCases:
             self.assertEqual(added_toks, len(new_toks))
             self.assertEqual(all_size_2, all_size + len(new_toks))
 
-            tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l")
+            tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)
             out_string = tokenizer.decode(tokens)
 
             self.assertGreaterEqual(len(tokens), 4)
@@ -148,7 +148,8 @@ class CommonTestCases:
             self.assertEqual(added_toks_2, len(new_toks_2))
             self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
 
-            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
+            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
+                                      add_special_tokens=False)
             out_string = tokenizer.decode(tokens)
 
             self.assertGreaterEqual(len(tokens), 6)
@@ -166,7 +167,7 @@ class CommonTestCases:
 
             tokens = tokenizer.tokenize(input_text)
             ids = tokenizer.convert_tokens_to_ids(tokens)
-            ids_2 = tokenizer.encode(input_text)
+            ids_2 = tokenizer.encode(input_text, add_special_tokens=False)
             self.assertListEqual(ids, ids_2)
 
             tokens_2 = tokenizer.convert_ids_to_tokens(ids)
@@ -206,7 +207,7 @@ class CommonTestCases:
             seq_0 = "Test this method."
             seq_1 = "With these inputs."
 
-            sequences = tokenizer.encode(seq_0, seq_1)
+            sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=False)
             attached_sequences = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
 
             # Method is implemented (e.g. not GPT-2)
@@ -219,7 +220,7 @@ class CommonTestCases:
             seq_0 = "This is a sentence to be encoded."
             stride = 2
 
-            sequence = tokenizer.encode(seq_0)
+            sequence = tokenizer.encode(seq_0, add_special_tokens=False)
             num_added_tokens = tokenizer.num_added_tokens()
             total_length = len(sequence) + num_added_tokens
             information = tokenizer.encode_plus(seq_0, max_length=total_length - 2, add_special_tokens=True, stride=stride)
@@ -239,13 +240,13 @@ class CommonTestCases:
             seq_1 = "This is another sentence to be encoded."
             stride = 2
 
-            sequence_0_no_special_tokens = tokenizer.encode(seq_0)
-            sequence_1_no_special_tokens = tokenizer.encode(seq_1)
+            sequence_0_no_special_tokens = tokenizer.encode(seq_0, add_special_tokens=False)
+            sequence_1_no_special_tokens = tokenizer.encode(seq_1, add_special_tokens=False)
 
             sequence = tokenizer.encode(seq_0, seq_1, add_special_tokens=True)
             truncated_second_sequence = tokenizer.build_inputs_with_special_tokens(
-                tokenizer.encode(seq_0),
-                tokenizer.encode(seq_1)[:-2]
+                tokenizer.encode(seq_0, add_special_tokens=False),
+                tokenizer.encode(seq_1, add_special_tokens=False)[:-2]
             )
 
             information = tokenizer.encode_plus(seq_0, seq_1, max_length=len(sequence) - 2, add_special_tokens=True,
@@ -283,7 +284,7 @@ class CommonTestCases:
             sequence_1 = "This one too please."
 
             # Testing single inputs
-            encoded_sequence = tokenizer.encode(sequence_0)
+            encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False)
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
             special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
@@ -294,7 +295,8 @@ class CommonTestCases:
             self.assertEqual(encoded_sequence, filtered_sequence)
 
             # Testing inputs pairs
-            encoded_sequence = tokenizer.encode(sequence_0) + tokenizer.encode(sequence_1)
+            encoded_sequence = tokenizer.encode(sequence_0, add_special_tokens=False) + tokenizer.encode(sequence_1,
+                                                                                                         add_special_tokens=False)
             encoded_sequence_dict = tokenizer.encode_plus(sequence_0, sequence_1, add_special_tokens=True)
             encoded_sequence_w_special = encoded_sequence_dict["input_ids"]
             special_tokens_mask = encoded_sequence_dict["special_tokens_mask"]
diff --git a/transformers/tests/tokenization_xlm_test.py b/transformers/tests/tokenization_xlm_test.py
index 0949b0cce4..567edf1ccd 100644
--- a/transformers/tests/tokenization_xlm_test.py
+++ b/transformers/tests/tokenization_xlm_test.py
@@ -69,8 +69,8 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
     def test_sequence_builders(self):
         tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
 
-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
 
         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
diff --git a/transformers/tests/tokenization_xlnet_test.py b/transformers/tests/tokenization_xlnet_test.py
index 1a5dbcf6df..653968b9af 100644
--- a/transformers/tests/tokenization_xlnet_test.py
+++ b/transformers/tests/tokenization_xlnet_test.py
@@ -92,8 +92,8 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
     def test_sequence_builders(self):
         tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
 
-        text = tokenizer.encode("sequence builders")
-        text_2 = tokenizer.encode("multi-sequence build")
+        text = tokenizer.encode("sequence builders", add_special_tokens=False)
+        text_2 = tokenizer.encode("multi-sequence build", add_special_tokens=False)
 
         encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
         encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 5e5be872ef..ac765165e2 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -689,14 +689,14 @@ class PreTrainedTokenizer(object):
         raise NotImplementedError
 
     def encode(self,
-                text,
-                text_pair=None,
-                add_special_tokens=False,
-                max_length=None,
-                stride=0,
-                truncation_strategy='longest_first',
-                return_tensors=None,
-                **kwargs):
+               text,
+               text_pair=None,
+               add_special_tokens=True,
+               max_length=None,
+               stride=0,
+               truncation_strategy='longest_first',
+               return_tensors=None,
+               **kwargs):
         """
         Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
 
@@ -739,7 +739,7 @@ class PreTrainedTokenizer(object):
     def encode_plus(self,
                     text,
                     text_pair=None,
-                    add_special_tokens=False,
+                    add_special_tokens=True,
                     max_length=None,
                     stride=0,
                     truncation_strategy='longest_first',
@@ -794,7 +794,7 @@ class PreTrainedTokenizer(object):
                                       truncation_strategy=truncation_strategy,
                                       return_tensors=return_tensors)
 
-    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=False, stride=0,
+    def prepare_for_model(self, ids, pair_ids=None, max_length=None, add_special_tokens=True, stride=0,
                           truncation_strategy='longest_first', return_tensors=None):
         """
         Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model.

From e16d46843a19ab289b82138e4eccec5610a76de7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Juli=C3=A1n=20Peller=20=28dataista=29?=
 <julianpeller@gmail.com>
Date: Tue, 22 Oct 2019 16:11:02 -0300
Subject: [PATCH 100/144] Fix architectures count

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index ad771f2ab1..e8506d6a39 100644
--- a/README.md
+++ b/README.md
@@ -39,7 +39,7 @@ State-of-the-art NLP for everyone
 Lower compute costs, smaller carbon footprint
 - Researchers can share trained models instead of always retraining
 - Practitioners can reduce compute time and production costs
-- 8 architectures with over 30 pretrained models, some in more than 100 languages
+- 10 architectures with over 30 pretrained models, some in more than 100 languages
 
 Choose the right framework for every part of a model's lifetime
 - Train state-of-the-art models in 3 lines of code
@@ -111,7 +111,7 @@ At some point in the future, you'll be able to seamlessly move from pre-training
 
 ## Model architectures
 
-🤗 Transformers currently provides 8 NLU/NLG architectures:
+🤗 Transformers currently provides 10 NLU/NLG architectures:
 
 1. **[BERT](https://github.com/google-research/bert)** (from Google) released with the paper [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805) by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova.
 2. **[GPT](https://github.com/openai/finetune-transformer-lm)** (from OpenAI) released with the paper [Improving Language Understanding by Generative Pre-Training](https://blog.openai.com/language-unsupervised/) by Alec Radford, Karthik Narasimhan, Tim Salimans and Ilya Sutskever.

From ef1b8b2ae5ad1057154a126879f7eb8de685f862 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 22 Oct 2019 21:27:20 +0000
Subject: [PATCH 101/144] [CTRL] warn if generation prompt does not start with
 a control code

see also https://github.com/salesforce/ctrl/pull/50
---
 README.md                         |  2 +-
 examples/README.md                |  2 +-
 examples/run_generation.py        |  5 ++-
 transformers/tokenization_ctrl.py | 59 +++++++++++++++++++++++++++++++
 4 files changed, 65 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index e8506d6a39..ecba50a74e 100644
--- a/README.md
+++ b/README.md
@@ -413,7 +413,7 @@ and from the Salesforce CTRL model:
 python ./examples/run_generation.py \
     --model_type=ctrl \
     --length=20 \
-    --model_name_or_path=gpt2 \
+    --model_name_or_path=ctrl \
     --temperature=0 \
     --repetition_penalty=1.2 \
 ```
diff --git a/examples/README.md b/examples/README.md
index 6b68d880eb..3a76a4a830 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -101,7 +101,7 @@ python run_lm_finetuning.py \
 
 Based on the script [`run_generation.py`](https://github.com/huggingface/transformers/blob/master/examples/run_generation.py).
 
-Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL and XLNet.
+Conditional text generation using the auto-regressive models of the library: GPT, GPT-2, Transformer-XL, XLNet, CTRL.
 A similar script is used for our official demo [Write With Transfomer](https://transformer.huggingface.co), where you
 can try out the different models available in the library.
 
diff --git a/examples/run_generation.py b/examples/run_generation.py
index ef58cfd844..ae0e27dcf0 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -196,7 +196,7 @@ def main():
 
     logger.info(args)
     if args.model_type in ["ctrl"]:
-        if args.temperature > 0.7 : 
+        if args.temperature > 0.7:
             logger.info('CTRL typically works better with lower temperatures (and lower top_k).')
 
     while True:
@@ -224,6 +224,9 @@ def main():
             # Models with memory likes to have a long prompt for short inputs.
             raw_text = (args.padding_text if args.padding_text else PADDING_TEXT) + raw_text
         context_tokens = tokenizer.encode(raw_text)
+        if args.model_type == "ctrl":
+            if not any(context_tokens[0] == x for x in tokenizer.control_codes.values()):
+                logger.info("WARNING! You are not starting your generation from a control code so you won't get good results")
         out = sample_sequence(
             model=model,
             context=context_tokens,
diff --git a/transformers/tokenization_ctrl.py b/transformers/tokenization_ctrl.py
index c8d67ad043..3d67fa2c5b 100644
--- a/transformers/tokenization_ctrl.py
+++ b/transformers/tokenization_ctrl.py
@@ -46,6 +46,64 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'ctrl': 256,
 }
 
+CONTROL_CODES = {
+    "Pregnancy": 168629,
+    "Christianity": 7675,
+    "Explain": 106423,
+    "Fitness": 63440,
+    "Saving": 63163,
+    "Ask": 27171,
+    "Ass": 95985,
+    "Joke": 163509,
+    "Questions": 45622,
+    "Thoughts": 49605,
+    "Retail": 52342,
+    "Feminism": 164338,
+    "Writing": 11992,
+    "Atheism": 192263,
+    "Netflix": 48616,
+    "Computing": 39639,
+    "Opinion": 43213,
+    "Alone": 44967,
+    "Funny": 58917,
+    "Gaming": 40358,
+    "Human": 4088,
+    "India": 1331,
+    "Joker": 77138,
+    "Diet": 36206,
+    "Legal": 11859,
+    "Norman": 4939,
+    "Tip": 72689,
+    "Weight": 52343,
+    "Movies": 46273,
+    "Running": 23425,
+    "Science": 2090,
+    "Horror": 37793,
+    "Confession": 60572,
+    "Finance": 12250,
+    "Politics": 16360,
+    "Scary": 191985,
+    "Support": 12654,
+    "Technologies": 32516,
+    "Teenage": 66160,
+    "Event": 32769,
+    "Learned": 67460,
+    "Notion": 182770,
+    "Wikipedia": 37583,
+    "Books": 6665,
+    "Extract": 76050,
+    "Confessions": 102701,
+    "Conspiracy": 75932,
+    "Links": 63674,
+    "Narcissus": 150425,
+    "Relationship": 54766,
+    "Relationships": 134796,
+    "Reviews": 41671,
+    "News": 4256,
+    "Translation": 26820,
+    "multilingual": 128406,
+}
+
 def get_pairs(word):
     """Return set of symbol pairs in a word.
 
@@ -68,6 +126,7 @@ class CTRLTokenizer(PreTrainedTokenizer):
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    control_codes = CONTROL_CODES
 
     def __init__(self, vocab_file, merges_file, unk_token="<unk>", **kwargs):
         super(CTRLTokenizer, self).__init__(unk_token=unk_token, **kwargs)

From bc3e57d551faa5f444107d0a41fcf56f4870ca8c Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 22 Oct 2019 17:51:30 -0400
Subject: [PATCH 102/144] Multi version doc deployment

---
 .circleci/config.yml |  6 +++---
 .circleci/deploy.sh  | 21 +++++++++++++++++++++
 2 files changed, 24 insertions(+), 3 deletions(-)
 create mode 100755 .circleci/deploy.sh

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 637d137492..38d0e291af 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -81,12 +81,12 @@ jobs:
             - checkout
             - run: sudo pip install --progress-bar off -r docs/requirements.txt
             - run: sudo pip install --progress-bar off -r requirements.txt
-            - run: cd docs && make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
+            - run: ./deploy.sh
 workflow_filters: &workflow_filters
     filters:
         branches:
             only:
-                - master
+                - deploy_doc
 workflows:
     version: 2
     build_and_test:
@@ -96,4 +96,4 @@ workflows:
             - build_py3_tf
             - build_py2_torch
             - build_py2_tf
-            - deploy_doc: *workflow_filters
\ No newline at end of file
+            - deploy_doc: *workflow_filters
diff --git a/.circleci/deploy.sh b/.circleci/deploy.sh
new file mode 100755
index 0000000000..19226ae4d5
--- /dev/null
+++ b/.circleci/deploy.sh
@@ -0,0 +1,21 @@
+cd docs
+
+function deploy_doc(){
+	echo "Creating doc at commit $1 and pushing to folder $2"
+	git checkout $1
+	if [ ! -z "$2" ] 
+	then
+		echo "Pushing version" $2
+		make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html $doc:$dir/$2
+	else
+		echo "Pushing master"
+		make clean && make html && scp -r -oStrictHostKeyChecking=no _build/html/* $doc:$dir
+	fi
+}
+
+deploy_doc "master" 
+deploy_doc "b33a385" v1.0.0
+deploy_doc "fe02e45" v1.1.0
+eploy_doc "89fd345" v1.2.0
+deploy_doc "fc9faa8" v2.0.0
+deploy_doc "3ddce1d" v2.1.1

From 69eba0ab19fda74ff00f1e4f5bded98a2fec9887 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 22 Oct 2019 17:53:52 -0400
Subject: [PATCH 103/144] Edit script path

---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 38d0e291af..4c6423ada4 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -81,7 +81,7 @@ jobs:
             - checkout
             - run: sudo pip install --progress-bar off -r docs/requirements.txt
             - run: sudo pip install --progress-bar off -r requirements.txt
-            - run: ./deploy.sh
+            - run: ./.circleci/deploy.sh
 workflow_filters: &workflow_filters
     filters:
         branches:

From fbcc5ff9fbad6e42d5d852fe315eb4c01707ed2e Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 22 Oct 2019 18:01:10 -0400
Subject: [PATCH 104/144] Change branch to master

---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 4c6423ada4..01e6d82b33 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -86,7 +86,7 @@ workflow_filters: &workflow_filters
     filters:
         branches:
             only:
-                - deploy_doc
+                - master
 workflows:
     version: 2
     build_and_test:

From 6e85bccafc8a175c0cc2eed27fd61af087483085 Mon Sep 17 00:00:00 2001
From: Lysandre Debut <lysandre@huggingface.co>
Date: Tue, 22 Oct 2019 18:07:01 -0400
Subject: [PATCH 105/144] Fixed typo

---
 .circleci/deploy.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/deploy.sh b/.circleci/deploy.sh
index 19226ae4d5..2bff0102ae 100755
--- a/.circleci/deploy.sh
+++ b/.circleci/deploy.sh
@@ -16,6 +16,6 @@ function deploy_doc(){
 deploy_doc "master" 
 deploy_doc "b33a385" v1.0.0
 deploy_doc "fe02e45" v1.1.0
-eploy_doc "89fd345" v1.2.0
+deploy_doc "89fd345" v1.2.0
 deploy_doc "fc9faa8" v2.0.0
 deploy_doc "3ddce1d" v2.1.1

From bd847ce7d7a498c3852f6bb31af8f9e781a85f65 Mon Sep 17 00:00:00 2001
From: "focox@qq.com" <https://gitee.com/focox/respiration_detection.git>
Date: Wed, 23 Oct 2019 20:27:13 +0800
Subject: [PATCH 106/144] fixed the bug raised by "eval_loss +=
 tmp_eval_loss.item()" when evaluating in parallel on multiple GPUs.

---
 examples/run_ner.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/examples/run_ner.py b/examples/run_ner.py
index 00eb039258..28d9e9db28 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -210,6 +210,9 @@ def evaluate(args, model, tokenizer, labels, pad_token_label_id, mode, prefix=""
             outputs = model(**inputs)
             tmp_eval_loss, logits = outputs[:2]
 
+            if args.n_gpu > 1:
+                tmp_eval_loss = tmp_eval_loss.mean()  # mean() to average on multi-gpu parallel evaluating
+
             eval_loss += tmp_eval_loss.item()
         nb_eval_steps += 1
         if preds is None:

From 8ad5c591cda96a40d2fd2662a6b76af86527289d Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 23 Oct 2019 10:29:47 -0400
Subject: [PATCH 107/144] [RELEASE] DistilRoBERTa

---
 docs/source/pretrained_models.rst |  4 ++++
 examples/distillation/README.md   | 38 +++++++++++++++++++++++--------
 2 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 1d02cd0dd7..43c08228bd 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -136,6 +136,10 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``distilgpt2``                                             | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
 |                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
 |                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``distilroberta-base``                                     | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
+|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | CTRL              | ``ctrl``                                                   | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters                                                                                    |
 |                   |                                                            | | Salesforce's Large-sized CTRL English model                                                                                         |
diff --git a/examples/distillation/README.md b/examples/distillation/README.md
index 0fbcb5628b..344b5f7d46 100644
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -1,25 +1,38 @@
 # Distil*
 
-This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT and DistilGPT2.
+This folder contains the original code used to train Distil* as well as examples showcasing how to use DistilBERT, DistilRoBERTa and DistilGPT2.
 
-**2019, October 3rd - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper superseeds our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performances. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
+**October 23rd, 2019 - Update** We release **DistilRoBERTa**: 95% of `RoBERTa-base`'s performance on GLUE, twice as fast as RoBERTa while being 35% smaller.
+
+**October 3rd, 2019 - Update** We release our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108) explaining our approach on **DistilBERT**. It includes updated results and further experiments. We applied the same method to GPT2 and release the weights of **DistilGPT2**. DistilGPT2 is two times faster and 33% smaller than GPT2. **The paper supersedes our [previous blogpost](https://medium.com/huggingface/distilbert-8cf3380435b5) with a different distillation loss and better performance. Please use the paper as a reference when comparing/reporting results on DistilBERT.**
+
+**September 19th, 2019 - Update:** We fixed bugs in the code and released an updated version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
 
-**2019, September 19th - Update:** We fixed bugs in the code and released an upadted version of the weights trained with a modification of the distillation loss. DistilBERT now reaches 97% of `BERT-base`'s performance on GLUE, and 86.9 F1 score on SQuAD v1.1 dev set (compared to 88.5 for `BERT-base`). We will publish a formal write-up of our approach in the near future!
 
 ## What is Distil*
 
 Distil* is a class of compressed models that started with DistilBERT. DistilBERT stands for Distillated-BERT. DistilBERT is a small, fast, cheap and light Transformer model based on Bert architecture. It has 40% less parameters than `bert-base-uncased`, runs 60% faster while preserving 97% of BERT's performances as measured on the GLUE language understanding benchmark. DistilBERT is trained using knowledge distillation, a technique to compress a large model called the teacher into a smaller model called the student. By distillating Bert, we obtain a smaller Transformer model that bears a lot of similarities with the original BERT model while being lighter, smaller and faster to run. DistilBERT is thus an interesting option to put large-scaled trained Transformer model into production.
 
-We have applied the same method to GPT2 and release the weights of the compressed model. On the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for DistilGPT2 (after fine-tuning on the train set).
+We have applied the same method to other Transformer architectures and released the weights:
+- GPT2: on the [WikiText-103](https://blog.einstein.ai/the-wikitext-long-term-dependency-language-modeling-dataset/) benchmark, GPT2 reaches a perplexity on the test set of 15.0 compared to 18.5 for **DistilGPT2** (after fine-tuning on the train set).
+- RoBERTa: **DistilRoBERTa** reaches 95% of `RoBERTa-base`'s performance on GLUE while being twice as fast and 35% smaller.
+- and more to come! 🤗🤗🤗
 
 For more information on DistilBERT, please refer to our [NeurIPS workshop paper](https://arxiv.org/abs/1910.01108).
 
 Here are the results on the dev sets of GLUE:
 
-| Model      | Macro-score | CoLA | MNLI | MRPC | QNLI | QQP  | RTE  | SST-2| STS-B| WNLI |
-| :---:      |    :---:    | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:|
-| BERT-base  |  **77.6**   | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7 |
-| DistilBERT |  **76.8**   | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4 |
+| Model                     | Macro-score                    | CoLA | MNLI | MRPC | QNLI | QQP  | RTE  | SST-2| STS-B| WNLI              |
+| :---:                     |    :---:                       | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:             |
+| BERT-base                 |  **77.6**                      | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7              |
+| DistilBERT                |  **76.8**                      | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4              |
+| :---:                     |    :---:                       | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:             |
+| RoBERTa-base (reported)   |  **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>**</sup> |
+| DistilRoBERTa<sup>1</sup> |  **79.0**/**82.3**<sup>2</sup> | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1              |
+
+<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directy perform transfer learning on the pre-trained DistilRoBERTa. 
+<sup>2</sup> Macro-score computed without WNLI.
+<sup>3</sup> We compute this score ourselves for completeness.
 
 ## Setup
 
@@ -27,13 +40,15 @@ This part of the library has only be tested with Python3.6+. There are few speci
 
 **Important note:** The training scripts have been updated to support PyTorch v1.2.0 (there are breakings changes compared to v1.1.0).
 
+
 ## How to use DistilBERT
 
 Transformers includes two pre-trained Distil* models, currently only provided for English (we are investigating the possibility to train and release a multilingual version of DistilBERT):
 
 - `distilbert-base-uncased`: DistilBERT English language model pretrained on the same data used to pretrain Bert (concatenation of the Toronto Book Corpus and full English Wikipedia) using distillation with the supervision of the `bert-base-uncased` version of Bert. The model has 6 layers, 768 dimension and 12 heads, totalizing 66M parameters.
 - `distilbert-base-uncased-distilled-squad`: A finetuned version of `distilbert-base-uncased` finetuned using (a second step of) knwoledge distillation on SQuAD 1.0. This model reaches a F1 score of 86.9 on the dev set (for comparison, Bert `bert-base-uncased` version reaches a 88.5 F1 score).
-- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset and . The model has 6 layers, 768 dimension and 12 heads, totalizing 82M (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
+- `distilgpt2`: DistilGPT2 English language model pretrained with the supervision of `gpt2` (the smallest version of GPT2) on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset. The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 124M parameters for GPT2). On average, DistilGPT2 is two times faster than GPT2.
+- `distilroberta-base`: DistilRoBERTa English language model pretrained with the supervision of `roberta-base` solely on [OpenWebTextCorpus](https://skylion007.github.io/OpenWebTextCorpus/), a reproduction of OpenAI's WebText dataset (it is ~4 times less training data than the teacher RoBERTa). The model has 6 layers, 768 dimension and 12 heads, totalizing 82M parameters (compared to 125M parameters for RoBERTa-base). On average DistilRoBERTa is twice as fast as Roberta-base.
 - and more to come! 🤗🤗🤗
 
 Using DistilBERT is very similar to using BERT. DistilBERT share the same tokenizer as BERT's `bert-base-uncased` even though we provide a link to this tokenizer under the `DistilBertTokenizer` name to have a consistent naming between the library models.
@@ -47,7 +62,10 @@ outputs = model(input_ids)
 last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 ```
 
-Similarly, using DistilGPT2 simply consists in calling the GPT2 classes from a different pretrained checkpoint: `model = GPT2Model.from_pretrained('distilgpt2')`.
+Similarly, using the other Distil* models simply consists in calling the base classes with a different pretrained checkpoint:
+- DistilGPT2: `model = GPT2Model.from_pretrained('distilgpt2')`
+- DistilRoBERTa: `model = RobertaModel.from_pretrained('distilroberta-base')`
+
 
 ## How to train Distil*
 

From 5b6cafb11b39e78724dc13b57b81bd73c9a66b49 Mon Sep 17 00:00:00 2001
From: VictorSanh <victorsanh@gmail.com>
Date: Wed, 23 Oct 2019 10:35:16 -0400
Subject: [PATCH 108/144] [release] fix table weirdness

---
 examples/distillation/README.md | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/examples/distillation/README.md b/examples/distillation/README.md
index 344b5f7d46..7da1ad015b 100644
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -26,12 +26,14 @@ Here are the results on the dev sets of GLUE:
 | :---:                     |    :---:                       | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:             |
 | BERT-base                 |  **77.6**                      | 48.9 | 84.3 | 88.6 | 89.3 | 89.5 | 71.3 | 91.7 | 91.2 | 43.7              |
 | DistilBERT                |  **76.8**                      | 49.1 | 81.8 | 90.2 | 90.2 | 89.2 | 62.9 | 92.7 | 90.7 | 44.4              |
-| :---:                     |    :---:                       | :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:| :---:             |
-| RoBERTa-base (reported)   |  **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>**</sup> |
+| ---                       |    ---                         |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  --- |  ---              |
+| RoBERTa-base (reported)   |  **83.2**/**86.4**<sup>2</sup> | 63.6 | 87.6 | 90.2 | 92.8 | 91.9 | 78.7 | 94.8 | 91.2 | 57.7<sup>3</sup> |
 | DistilRoBERTa<sup>1</sup> |  **79.0**/**82.3**<sup>2</sup> | 59.4 | 83.9 | 86.6 | 90.8 | 89.4 | 67.9 | 92.5 | 88.3 | 52.1              |
 
-<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directy perform transfer learning on the pre-trained DistilRoBERTa. 
+<sup>1</sup> We did not use the MNLI checkpoint for fine-tuning but directly performed transfer learning on the pre-trained DistilRoBERTa.
+
 <sup>2</sup> Macro-score computed without WNLI.
+
 <sup>3</sup> We compute this score ourselves for completeness.
 
 ## Setup

From b82bfbd0c307ba84da4f326900f1479df977efeb Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Thu, 24 Oct 2019 15:55:31 +0000
Subject: [PATCH 109/144] Updated README to show all available documentation

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e8506d6a39..3d89c65901 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ Choose the right framework for every part of a model's lifetime
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
 | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
 | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
-| [Documentation](https://huggingface.co/transformers/) | Full API documentation and more |
+| [Documentation](https://huggingface.co/transformers/) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) | Full API documentation and more |
 
 ## Installation
 

From 66085a132161d3257bb971d886bea1b52a476e4e Mon Sep 17 00:00:00 2001
From: Matt Maybeno <mmaybeno@gmail.com>
Date: Wed, 23 Oct 2019 21:05:13 -0700
Subject: [PATCH 110/144] RoBERTa token classification

[WIP] copy paste bert token classification for roberta
---
 transformers/__init__.py                      |  2 +
 transformers/modeling_roberta.py              | 72 +++++++++++++++++++
 transformers/modeling_tf_roberta.py           | 51 +++++++++++++
 transformers/tests/modeling_roberta_test.py   | 19 ++++-
 .../tests/modeling_tf_roberta_test.py         | 15 ++++
 5 files changed, 158 insertions(+), 1 deletion(-)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index fbc92f078e..dbc66f86b9 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -89,6 +89,7 @@ if is_torch_available():
                             XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_roberta import (RobertaForMaskedLM, RobertaModel,
                                 RobertaForSequenceClassification, RobertaForMultipleChoice,
+                                RobertaForTokenClassification,
                                 ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
                                 DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
@@ -139,6 +140,7 @@ if is_tf_available():
     from .modeling_tf_roberta import (TFRobertaPreTrainedModel, TFRobertaMainLayer,
                                       TFRobertaModel, TFRobertaForMaskedLM,
                                       TFRobertaForSequenceClassification,
+                                      TFRobertaForTokenClassification,
                                       TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
 
     from .modeling_tf_distilbert import (TFDistilBertPreTrainedModel, TFDistilBertMainLayer,
diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py
index eb340dc7fb..6b8d381579 100644
--- a/transformers/modeling_roberta.py
+++ b/transformers/modeling_roberta.py
@@ -343,6 +343,7 @@ class RobertaForSequenceClassification(BertPreTrainedModel):
 
         return outputs  # (loss), logits, (hidden_states), (attentions)
 
+
 @add_start_docstrings("""Roberta Model with a multiple choice classification head on top (a linear layer on top of
     the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
     ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
@@ -451,6 +452,77 @@ class RobertaForMultipleChoice(BertPreTrainedModel):
         return outputs  # (loss), reshaped_logits, (hidden_states), (attentions)
 
 
+@add_start_docstrings("""Roberta Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+class RobertaForTokenClassification(BertPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the token classification loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = RobertaForTokenClassification.from_pretrained('roberta-base')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, scores = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(RobertaForTokenClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.roberta = RobertaModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+
+        outputs = self.roberta(input_ids,
+                               attention_mask=attention_mask,
+                               token_type_ids=token_type_ids,
+                               position_ids=position_ids,
+                               head_mask=head_mask)
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            # Only keep active parts of the loss
+            if attention_mask is not None:
+                active_loss = attention_mask.view(-1) == 1
+                active_logits = logits.view(-1, self.num_labels)[active_loss]
+                active_labels = labels.view(-1)[active_loss]
+                loss = loss_fct(active_logits, active_labels)
+            else:
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), scores, (hidden_states), (attentions)
+
 
 class RobertaClassificationHead(nn.Module):
     """Head for sentence-level classification tasks."""
diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py
index 244c83f2b3..13a0522211 100644
--- a/transformers/modeling_tf_roberta.py
+++ b/transformers/modeling_tf_roberta.py
@@ -371,3 +371,54 @@ class TFRobertaForSequenceClassification(TFRobertaPreTrainedModel):
         outputs = (logits,) + outputs[2:]
 
         return outputs  # logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""RoBERTa Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
+class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import RobertaTokenizer, TFRobertaForTokenClassification
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        scores = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFRobertaForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.roberta = TFRobertaMainLayer(config, name='roberta')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.roberta(inputs, **kwargs)
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        return outputs  # scores, (hidden_states), (attentions)
diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py
index 82e10da915..0620ddf630 100644
--- a/transformers/tests/modeling_roberta_test.py
+++ b/transformers/tests/modeling_roberta_test.py
@@ -24,7 +24,8 @@ from transformers import is_torch_available
 
 if is_torch_available():
     import torch
-    from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
+    from transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM,
+                              RobertaForSequenceClassification, RobertaForTokenClassification)
     from transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 else:
     pytestmark = pytest.mark.skip("Require Torch")
@@ -156,6 +157,22 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.seq_length, self.vocab_size])
             self.check_loss_output(result)
 
+        def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask,
+                                                              sequence_labels, token_labels, choice_labels):
+            config.num_labels = self.num_labels
+            model = RobertaForTokenClassification(config=config)
+            model.eval()
+            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
+                                 labels=token_labels)
+            result = {
+                "loss": loss,
+                "logits": logits,
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].size()),
+                [self.batch_size, self.seq_length, self.num_labels])
+            self.check_loss_output(result)
+
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids, token_type_ids, input_mask,
diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py
index 735c9aae27..edbfa4e205 100644
--- a/transformers/tests/modeling_tf_roberta_test.py
+++ b/transformers/tests/modeling_tf_roberta_test.py
@@ -30,6 +30,7 @@ if is_tf_available():
     import numpy
     from transformers.modeling_tf_roberta import (TFRobertaModel, TFRobertaForMaskedLM,
                                                           TFRobertaForSequenceClassification,
+                                                          TFRobertaForTokenClassification,
                                                           TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
     pytestmark = pytest.mark.skip("Require TensorFlow")
@@ -154,6 +155,20 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
                 list(result["prediction_scores"].shape),
                 [self.batch_size, self.seq_length, self.vocab_size])
 
+        def create_and_check_roberta_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            config.num_labels = self.num_labels
+            model = TFRobertaForTokenClassification(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            logits, = model(inputs)
+            result = {
+                "logits": logits.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].shape),
+                [self.batch_size, self.seq_length, self.num_labels])
+
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
             (config, input_ids, token_type_ids, input_mask,

From b92d68421dee75c3a078b26b78a05bd59007d855 Mon Sep 17 00:00:00 2001
From: Matt Maybeno <mmaybeno@gmail.com>
Date: Wed, 23 Oct 2019 21:31:28 -0700
Subject: [PATCH 111/144] Use roberta model and update doc strings

---
 transformers/modeling_roberta.py    | 6 +++++-
 transformers/modeling_tf_roberta.py | 2 +-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/transformers/modeling_roberta.py b/transformers/modeling_roberta.py
index 6b8d381579..9d16c87888 100644
--- a/transformers/modeling_roberta.py
+++ b/transformers/modeling_roberta.py
@@ -478,12 +478,16 @@ class RobertaForTokenClassification(BertPreTrainedModel):
 
         tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
         model = RobertaForTokenClassification.from_pretrained('roberta-base')
-        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
         labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, labels=labels)
         loss, scores = outputs[:2]
 
     """
+    config_class = RobertaConfig
+    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "roberta"
+
     def __init__(self, config):
         super(RobertaForTokenClassification, self).__init__(config)
         self.num_labels = config.num_labels
diff --git a/transformers/modeling_tf_roberta.py b/transformers/modeling_tf_roberta.py
index 13a0522211..a239bc642b 100644
--- a/transformers/modeling_tf_roberta.py
+++ b/transformers/modeling_tf_roberta.py
@@ -396,7 +396,7 @@ class TFRobertaForTokenClassification(TFRobertaPreTrainedModel):
 
         tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
         model = TFRobertaForTokenClassification.from_pretrained('roberta-base')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
         outputs = model(input_ids)
         scores = outputs[0]
 

From 4e5f88b74fa914a5f45aec3260977acfc3513536 Mon Sep 17 00:00:00 2001
From: Matt Maybeno <mmaybeno@gmail.com>
Date: Wed, 23 Oct 2019 22:50:03 -0700
Subject: [PATCH 112/144] Add Roberta to run_ner.py

---
 examples/run_ner.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/examples/run_ner.py b/examples/run_ner.py
index 00eb039258..16fa89c3e7 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -35,15 +35,17 @@ from utils_ner import convert_examples_to_features, get_labels, read_examples_fr
 
 from transformers import AdamW, WarmupLinearSchedule
 from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, BertTokenizer
+from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
 
 logger = logging.getLogger(__name__)
 
 ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, )),
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig)),
     ())
 
 MODEL_CLASSES = {
     "bert": (BertConfig, BertForTokenClassification, BertTokenizer),
+    "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer)
 }
 
 

From ae1d03fc51bb22ed59517ee6f92c560417fdb049 Mon Sep 17 00:00:00 2001
From: Matt Maybeno <mmaybeno@gmail.com>
Date: Thu, 24 Oct 2019 10:43:57 -0700
Subject: [PATCH 113/144] Add roberta to doc

---
 examples/run_ner.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_ner.py b/examples/run_ner.py
index 16fa89c3e7..740b422429 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert). """
+""" Fine-tuning the library models for named entity recognition on CoNLL-2003 (Bert or Roberta). """
 
 from __future__ import absolute_import, division, print_function
 

From bab6ad01aad6f6a8cbf5b8634d890d8fce9f46d1 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Thu, 24 Oct 2019 21:41:45 +0000
Subject: [PATCH 114/144] run_tf_glue works with all tasks

---
 examples/run_tf_glue.py               | 43 ++++++++++++++++++++++-----
 transformers/data/processors/glue.py  |  4 +++
 transformers/data/processors/utils.py |  7 +++++
 3 files changed, 46 insertions(+), 8 deletions(-)

diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py
index 399fe9e616..73173b0cf1 100644
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -1,29 +1,47 @@
 import os
 import tensorflow as tf
 import tensorflow_datasets
-from transformers import BertTokenizer, TFBertForSequenceClassification, glue_convert_examples_to_features, BertForSequenceClassification
+from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig, glue_convert_examples_to_features, BertForSequenceClassification, glue_processors
 
 # script parameters
 BATCH_SIZE = 32
 EVAL_BATCH_SIZE = BATCH_SIZE * 2
 USE_XLA = False
 USE_AMP = False
+EPOCHS = 3
+
+TASK = "mrpc"
+
+if TASK == "sst-2":
+    TFDS_TASK = "sst2"
+elif TASK == "sts-b":
+    TFDS_TASK = "stsb"
+else: 
+    TFDS_TASK = TASK
+
+num_labels = len(glue_processors[TASK]().get_labels())
+print(num_labels)
 
 tf.config.optimizer.set_jit(USE_XLA)
 tf.config.optimizer.set_experimental_options({"auto_mixed_precision": USE_AMP})
 
-# Load tokenizer and model from pretrained model/vocabulary
+# Load tokenizer and model from pretrained model/vocabulary. Specify the number of labels to classify (2+: classification, 1: regression)
+config = BertConfig.from_pretrained("bert-base-cased", num_labels=num_labels)
 tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
-model = TFBertForSequenceClassification.from_pretrained('bert-base-cased')
+model = TFBertForSequenceClassification.from_pretrained('bert-base-cased', config=config)
 
 # Load dataset via TensorFlow Datasets
-data, info = tensorflow_datasets.load('glue/mrpc', with_info=True)
+data, info = tensorflow_datasets.load(f'glue/{TFDS_TASK}', with_info=True)
 train_examples = info.splits['train'].num_examples
+
+# MNLI expects either validation_matched or validation_mismatched
 valid_examples = info.splits['validation'].num_examples
 
 # Prepare dataset for GLUE as a tf.data.Dataset instance
-train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, 'mrpc')
-valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, 'mrpc')
+train_dataset = glue_convert_examples_to_features(data['train'], tokenizer, 128, TASK)
+
+# MNLI expects either validation_matched or validation_mismatched
+valid_dataset = glue_convert_examples_to_features(data['validation'], tokenizer, 128, TASK)
 train_dataset = train_dataset.shuffle(128).batch(BATCH_SIZE).repeat(-1)
 valid_dataset = valid_dataset.batch(EVAL_BATCH_SIZE)
 
@@ -32,7 +50,13 @@ opt = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08)
 if USE_AMP:
     # loss scaling is currently required when using mixed precision
     opt = tf.keras.mixed_precision.experimental.LossScaleOptimizer(opt, 'dynamic')
-loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+
+
+if num_labels == 1:
+    loss = tf.keras.losses.MeanSquaredError()
+else:
+    loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+
 metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
 model.compile(optimizer=opt, loss=loss, metrics=[metric])
 
@@ -40,7 +64,7 @@ model.compile(optimizer=opt, loss=loss, metrics=[metric])
 train_steps = train_examples//BATCH_SIZE
 valid_steps = valid_examples//EVAL_BATCH_SIZE
 
-history = model.fit(train_dataset, epochs=2, steps_per_epoch=train_steps,
+history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps,
                     validation_data=valid_dataset, validation_steps=valid_steps)
 
 # Save TF2 model
@@ -57,6 +81,9 @@ sentence_2 = 'His findings were not compatible with this research.'
 inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
 inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
 
+del inputs_1["special_tokens_mask"]
+del inputs_2["special_tokens_mask"]
+
 pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
 pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
 print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py
index 741569ea30..c81582fb72 100644
--- a/transformers/data/processors/glue.py
+++ b/transformers/data/processors/glue.py
@@ -76,10 +76,14 @@ def glue_convert_examples_to_features(examples, tokenizer,
 
     features = []
     for (ex_index, example) in enumerate(examples):
+        if ex_index == 10:
+            break
+        
         if ex_index % 10000 == 0:
             logger.info("Writing example %d" % (ex_index))
         if is_tf_dataset:
             example = processor.get_example_from_tensor_dict(example)
+            example = processor.tfds_map(example)
 
         inputs = tokenizer.encode_plus(
             example.text_a,
diff --git a/transformers/data/processors/utils.py b/transformers/data/processors/utils.py
index 27138f9959..07bdf3150c 100644
--- a/transformers/data/processors/utils.py
+++ b/transformers/data/processors/utils.py
@@ -107,6 +107,13 @@ class DataProcessor(object):
         """Gets the list of labels for this data set."""
         raise NotImplementedError()
 
+    def tfds_map(self, example):
+        """Some tensorflow_datasets datasets are not formatted the same way the GLUE datasets are. 
+        This method converts examples to the correct format."""
+        if len(self.get_labels()) > 1:
+            example.label = self.get_labels()[int(example.label)]
+        return example
+
     @classmethod
     def _read_tsv(cls, input_file, quotechar=None):
         """Reads a tab separated value file."""

From beaf66b1f30aa29e11a02ecb5a7edb6b7b99eb01 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Thu, 24 Oct 2019 21:43:28 +0000
Subject: [PATCH 115/144] Remove break

---
 transformers/data/processors/glue.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py
index c81582fb72..518251b050 100644
--- a/transformers/data/processors/glue.py
+++ b/transformers/data/processors/glue.py
@@ -76,9 +76,6 @@ def glue_convert_examples_to_features(examples, tokenizer,
 
     features = []
     for (ex_index, example) in enumerate(examples):
-        if ex_index == 10:
-            break
-        
         if ex_index % 10000 == 0:
             logger.info("Writing example %d" % (ex_index))
         if is_tf_dataset:

From f873a3edb2eba78d92e55ecb55902f7e9cb90777 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 15:21:46 +0200
Subject: [PATCH 116/144] the decoder attends to the output of the encoder
 stack (last layer)

---
 transformers/modeling_bert.py    | 13 ++++++-------
 transformers/modeling_seq2seq.py |  2 +-
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index be8ec5ba21..aa022bac8a 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -288,8 +288,8 @@ class BertAttention(nn.Module):
         self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
         self.pruned_heads = self.pruned_heads.union(heads)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_state=None, encoder_attention_mask=None):
-        self_outputs = self.self(hidden_states, attention_mask, head_mask, encoder_hidden_state, encoder_attention_mask)
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
+        self_outputs = self.self(hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
         attention_output = self.output(self_outputs[0], hidden_states)
         outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
         return outputs
@@ -334,13 +334,13 @@ class BertLayer(nn.Module):
         self.intermediate = BertIntermediate(config)
         self.output = BertOutput(config)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_state=None, encoder_attention_mask=None):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
         self_attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
         attention_output = self_attention_outputs[0]
         outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
 
-        if self.is_decoder and encoder_hidden_state is not None:
-            cross_attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_state, encoder_attention_mask)
+        if self.is_decoder and encoder_hidden_states is not None:
+            cross_attention_outputs = self.crossattention(attention_output, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask)
             attention_output = cross_attention_outputs[0]
             outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
 
@@ -364,8 +364,7 @@ class BertEncoder(nn.Module):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
-            encoder_hidden_state = encoder_hidden_states[i]
-            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], encoder_hidden_state, encoder_attention_mask)
+            layer_outputs = layer_module(hidden_states, attention_mask, head_mask[i], encoder_hidden_states, encoder_attention_mask)
             hidden_states = layer_outputs[0]
 
             if self.output_attentions:
diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index ca3b9dc87a..108fdaa853 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -165,7 +165,7 @@ class PreTrainedSeq2seq(nn.Module):
         encoder_hidden_states = kwargs_encoder.pop("encoder_hidden_states", None)
         if encoder_hidden_states is None:
             encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[0]
+            encoder_hidden_states = encoder_outputs[0][-1]  # output of the encoder *stack*
         else:
             encoder_outputs = ()
 

From dc580dd4c720c5daefe7411f604b6908da99681e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 16:56:36 +0200
Subject: [PATCH 117/144] add lm_labels for the LM cross-entropy

---
 transformers/modeling_bert.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index aa022bac8a..d10f32c1fa 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -819,7 +819,7 @@ class BertForMaskedLM(BertPreTrainedModel):
                                    self.bert.embeddings.word_embeddings)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
-                masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None):
+                masked_lm_labels=None, lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None):
 
         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
@@ -840,7 +840,7 @@ class BertForMaskedLM(BertPreTrainedModel):
         #    of predictions for masked words.
         # 2. If encoder hidden states are provided we are in a causal situation where we
         #    try to predict the next word for each input in the encoder.
-        if masked_lm_labels is not None and encoder_hidden_states is not None:
+        if masked_lm_labels is not None and lm_labels is not None:
             raise AttributeError("Masked LM training with an encoder-decoder is not supported.")
 
         if masked_lm_labels is not None:
@@ -848,12 +848,12 @@ class BertForMaskedLM(BertPreTrainedModel):
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             outputs = (masked_lm_loss,) + outputs
 
-        if encoder_hidden_states is not None:
+        if lm_labels is not None:
             # we are doing next-token prediction; shift prediction scores and input ids by one
             prediction_scores = prediction_scores[:, :-1, :]
-            input_ids = input_ids[:, 1:, :]
+            lm_labels = lm_labels[:, 1:, :]
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            seq2seq_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), input_ids.view(-1))
+            seq2seq_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1))
             outputs = (seq2seq_loss,) + outputs
 
         return outputs  # (mlm_or_seq2seq_loss), prediction_scores, (hidden_states), (attentions)

From b915ba9dfe51db8161db5bc599df3944646b2b72 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 17:44:20 +0200
Subject: [PATCH 118/144] pad sequence with 0, mask with -1

---
 examples/run_seq2seq_finetuning.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index 38dcb2d005..1f21cff82c 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -58,7 +58,7 @@ class TextDataset(Dataset):
     [2] https://github.com/abisee/cnn-dailymail/
     """
 
-    def __init__(self, tokenizer, prefix='train', data_dir="", block_size=512):
+    def __init__(self, tokenizer, prefix="train", data_dir="", block_size=512):
         assert os.path.isdir(data_dir)
 
         # Load features that have already been computed if present
@@ -165,7 +165,12 @@ def _fit_to_block_size(sequence, block_size):
     if len(sequence) > block_size:
         return sequence[:block_size]
     else:
-        return sequence.extend([-1] * (block_size - len(sequence)))
+        return sequence.extend([0] * (block_size - len(sequence)))
+
+
+def mask_padding_tokens(sequence):
+    """ Replace the padding token with -1 values """
+    return [s if s != 0 else -1 for s in sequence]
 
 
 def load_and_cache_examples(args, tokenizer):
@@ -219,11 +224,8 @@ def train(args, train_dataset, model, tokenizer):
     logger.info("***** Running training *****")
     logger.info("  Num examples = %d", len(train_dataset))
     logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info(
-        "  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size
-    )
-    logger.info(
-        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
         args.train_batch_size
         * args.gradient_accumulation_steps
         * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
@@ -242,7 +244,7 @@ def train(args, train_dataset, model, tokenizer):
             source = ([s for s, _ in batch]).to(args.device)
             target = ([t for _, t in batch]).to(args.device)
             model.train()
-            outputs = model(source, target)
+            outputs = model(source, target, decoder_lm_labels=mask_padding_tokens(target))
             loss = outputs[0]
             loss.backward()
 

From cb26b035c696f32b7f47df18a6d84b88b7b1745d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 17:52:32 +0200
Subject: [PATCH 119/144] remove potential UndefinedError

---
 transformers/modeling_xlm.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py
index f1df6f668f..166b98de63 100644
--- a/transformers/modeling_xlm.py
+++ b/transformers/modeling_xlm.py
@@ -81,8 +81,8 @@ def get_masks(slen, lengths, causal, padding_mask=None):
         mask = alen < lengths[:, None]
 
     # attention mask is the same as mask, or triangular inferior attention (causal)
+    bs = lengths.size(0)
     if causal:
-        bs = lengths.size(0)
         attn_mask = alen[None, None, :].repeat(bs, slen, 1) <= alen[None, :, None]
     else:
         attn_mask = mask

From a67413ccc82ed6bfdf9ea2f6b17ee3869f2f87a7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 18:08:09 +0200
Subject: [PATCH 120/144] extend works in-place

---
 examples/run_seq2seq_finetuning.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
index 1f21cff82c..61c4abfe6e 100644
--- a/examples/run_seq2seq_finetuning.py
+++ b/examples/run_seq2seq_finetuning.py
@@ -165,7 +165,8 @@ def _fit_to_block_size(sequence, block_size):
     if len(sequence) > block_size:
         return sequence[:block_size]
     else:
-        return sequence.extend([0] * (block_size - len(sequence)))
+        sequence.extend([0] * (block_size - len(sequence)))
+        return sequence
 
 
 def mask_padding_tokens(sequence):

From 932543f77ee69b776a2ea4c09f09745624d4c6a3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 17 Oct 2019 21:07:55 +0200
Subject: [PATCH 121/144] fix test of truncation function

---
 examples/run_seq2seq_finetuning_test.py | 48 +++++++------------------
 1 file changed, 13 insertions(+), 35 deletions(-)

diff --git a/examples/run_seq2seq_finetuning_test.py b/examples/run_seq2seq_finetuning_test.py
index e59f016da4..77dc58666c 100644
--- a/examples/run_seq2seq_finetuning_test.py
+++ b/examples/run_seq2seq_finetuning_test.py
@@ -21,43 +21,21 @@ class DataLoaderTest(unittest.TestCase):
     def setUp(self):
         self.block_size = 10
 
-    def test_truncate_source_and_target_too_small(self):
-        """ When the sum of the lengths of the source and target sequences is
-        smaller than the block size (minus the number of special tokens), skip the example. """
-        src_seq = [1, 2, 3, 4]
-        tgt_seq = [5, 6]
-        self.assertEqual(_fit_to_block_size(src_seq, tgt_seq, self.block_size), None)
+    def test_truncate_sequence_too_small(self):
+        """ Pad the sequence with 0 if the sequence is smaller than the block size."""
+        sequence = [1, 2, 3, 4]
+        expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
+        self.assertEqual(_fit_to_block_size(sequence, self.block_size), expected_output)
 
-    def test_truncate_source_and_target_fit_exactly(self):
-        """ When the sum of the lengths of the source and target sequences is
-        equal to the block size (minus the number of special tokens), return the
-        sequences unchanged. """
-        src_seq = [1, 2, 3, 4]
-        tgt_seq = [5, 6, 7]
-        fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
-        self.assertListEqual(src_seq, fitted_src)
-        self.assertListEqual(tgt_seq, fitted_tgt)
+    def test_truncate_sequence_fit_exactly(self):
+        sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        self.assertEqual(_fit_to_block_size(sequence, self.block_size), expected_output)
 
-    def test_truncate_source_too_big_target_ok(self):
-        src_seq = [1, 2, 3, 4, 5, 6]
-        tgt_seq = [1, 2]
-        fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
-        self.assertListEqual(fitted_src, [1, 2, 3, 4, 5])
-        self.assertListEqual(fitted_tgt, fitted_tgt)
-
-    def test_truncate_target_too_big_source_ok(self):
-        src_seq = [1, 2, 3, 4]
-        tgt_seq = [1, 2, 3, 4]
-        fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
-        self.assertListEqual(fitted_src, src_seq)
-        self.assertListEqual(fitted_tgt, [1, 2, 3])
-
-    def test_truncate_source_and_target_too_big(self):
-        src_seq = [1, 2, 3, 4, 5, 6, 7]
-        tgt_seq = [1, 2, 3, 4, 5, 6, 7]
-        fitted_src, fitted_tgt = _fit_to_block_size(src_seq, tgt_seq, self.block_size)
-        self.assertListEqual(fitted_src, [1, 2, 3, 4, 5])
-        self.assertListEqual(fitted_tgt, [1, 2])
+    def test_truncate_sequence_too_big(self):
+        sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
+        expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        self.assertEqual(_fit_to_block_size(sequence, self.block_size), expected_output)
 
     def test_process_story_no_highlights(self):
         """ Processing a story with no highlights should raise an exception.

From 4c3ac4a7d83cdf37b796d783bb66a89bbd09ef9d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Fri, 18 Oct 2019 12:29:30 +0200
Subject: [PATCH 122/144] here's one big commit

---
 examples/README.md                            |   5 +-
 examples/run_seq2seq_finetuning.py            | 361 ----------
 examples/run_summarization_finetuning.py      | 620 ++++++++++++++++++
 ...y => run_summarization_finetuning_test.py} |  28 +-
 transformers/__init__.py                      |   2 +-
 transformers/modeling_beam_search.py          | 240 +++++++
 transformers/modeling_bert.py                 |  20 +-
 transformers/modeling_seq2seq.py              |  83 ++-
 8 files changed, 951 insertions(+), 408 deletions(-)
 delete mode 100644 examples/run_seq2seq_finetuning.py
 create mode 100644 examples/run_summarization_finetuning.py
 rename examples/{run_seq2seq_finetuning_test.py => run_summarization_finetuning_test.py} (79%)
 create mode 100644 transformers/modeling_beam_search.py

diff --git a/examples/README.md b/examples/README.md
index e0fe1fc704..bec6d57171 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -393,7 +393,8 @@ This fine-tuned model is available as a checkpoint under the reference
 
 ## Seq2seq model fine-tuning
 
-Based on the script [`run_seq2seq_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_seq2seq_finetuning.py).
+Based on the script
+[`run_summarization_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_summarization_finetuning.py).
 
 Before running this script you should download **both** CNN and Daily Mail
 datasets from [Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/)  (the
@@ -412,7 +413,7 @@ archive.
 ```bash
 export DATA_PATH=/path/to/dataset/
 
-python run_seq2seq_finetuning.py \
+python run_summarization_finetuning.py \
     --output_dir=output \
     --model_type=bert2bert \
     --model_name_or_path=bert2bert \
diff --git a/examples/run_seq2seq_finetuning.py b/examples/run_seq2seq_finetuning.py
deleted file mode 100644
index 61c4abfe6e..0000000000
--- a/examples/run_seq2seq_finetuning.py
+++ /dev/null
@@ -1,361 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The Microsoft Reseach team and The HuggingFace Inc. team.
-# Copyright (c) 2018 Microsoft and The HuggingFace Inc.  All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-""" Finetuning seq2seq models for sequence generation."""
-
-import argparse
-from collections import deque
-import logging
-import pickle
-import random
-import os
-
-import numpy as np
-from tqdm import tqdm, trange
-import torch
-from torch.utils.data import Dataset, RandomSampler
-
-from transformers import AutoTokenizer, Model2Model
-
-logger = logging.getLogger(__name__)
-
-
-def set_seed(args):
-    random.seed(args.seed)
-    np.random.seed(args.seed)
-    torch.manual_seed(args.seed)
-
-
-# ------------
-# Load dataset
-# ------------
-
-
-class TextDataset(Dataset):
-    """ Abstracts the dataset used to train seq2seq models.
-
-    CNN/Daily News:
-
-    The CNN/Daily News raw datasets are downloaded from [1]. The stories are
-    stored in different files; the summary appears at the end of the story as
-    sentences that are prefixed by the special `@highlight` line. To process
-    the data, untar both datasets in the same folder, and pass the path to this
-    folder as the "data_dir argument. The formatting code was inspired by [2].
-
-    [1] https://cs.nyu.edu/~kcho/
-    [2] https://github.com/abisee/cnn-dailymail/
-    """
-
-    def __init__(self, tokenizer, prefix="train", data_dir="", block_size=512):
-        assert os.path.isdir(data_dir)
-
-        # Load features that have already been computed if present
-        cached_features_file = os.path.join(
-            data_dir, "cached_lm_{}_{}".format(block_size, prefix)
-        )
-        if os.path.exists(cached_features_file):
-            logger.info("Loading features from cached file %s", cached_features_file)
-            with open(cached_features_file, "rb") as source:
-                self.examples = pickle.load(source)
-                return
-
-        logger.info("Creating features from dataset at %s", data_dir)
-        self.examples = []
-        datasets = ["cnn", "dailymail"]
-        for dataset in datasets:
-            path_to_stories = os.path.join(data_dir, dataset, "stories")
-            assert os.path.isdir(path_to_stories)
-
-            story_filenames_list = os.listdir(path_to_stories)
-            for story_filename in story_filenames_list:
-                path_to_story = os.path.join(path_to_stories, story_filename)
-                if not os.path.isfile(path_to_story):
-                    continue
-
-                with open(path_to_story, encoding="utf-8") as source:
-                    try:
-                        raw_story = source.read()
-                        story, summary = process_story(raw_story)
-                    except IndexError:  # skip ill-formed stories
-                        continue
-
-                story = tokenizer.encode(story)
-                story_seq = _fit_to_block_size(story, block_size)
-
-                summary = tokenizer.encode(summary)
-                summary_seq = _fit_to_block_size(summary, block_size)
-
-                self.examples.append((story_seq, summary_seq))
-
-        logger.info("Saving features into cache file %s", cached_features_file)
-        with open(cached_features_file, "wb") as sink:
-            pickle.dump(self.examples, sink, protocol=pickle.HIGHEST_PROTOCOL)
-
-    def __len__(self):
-        return len(self.examples)
-
-    def __getitem__(self, items):
-        return torch.tensor(self.examples[items])
-
-
-def process_story(raw_story):
-    """ Extract the story and summary from a story file.
-
-    Attributes:
-        raw_story (str): content of the story file as an utf-8 encoded string.
-
-    Raises:
-        IndexError: If the stoy is empty or contains no highlights.
-    """
-    file_lines = list(
-        filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
-    )
-
-    # for some unknown reason some lines miss a period, add it
-    file_lines = [_add_missing_period(line) for line in file_lines]
-
-    # gather article lines
-    story_lines = []
-    lines = deque(file_lines)
-    while True:
-        try:
-            element = lines.popleft()
-            if element.startswith("@highlight"):
-                break
-            story_lines.append(element)
-        except IndexError as ie:  # if "@highlight" absent from file
-            raise ie
-
-    # gather summary lines
-    highlights_lines = list(filter(lambda t: not t.startswith("@highlight"), lines))
-
-    # join the lines
-    story = " ".join(story_lines)
-    summary = " ".join(highlights_lines)
-
-    return story, summary
-
-
-def _add_missing_period(line):
-    END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"]
-    if line.startswith("@highlight"):
-        return line
-    if line[-1] in END_TOKENS:
-        return line
-    return line + "."
-
-
-def _fit_to_block_size(sequence, block_size):
-    """ Adapt the source and target sequences' lengths to the block size.
-    If the sequence is shorter than the block size we pad it with -1 ids
-    which correspond to padding tokens.
-    """
-    if len(sequence) > block_size:
-        return sequence[:block_size]
-    else:
-        sequence.extend([0] * (block_size - len(sequence)))
-        return sequence
-
-
-def mask_padding_tokens(sequence):
-    """ Replace the padding token with -1 values """
-    return [s if s != 0 else -1 for s in sequence]
-
-
-def load_and_cache_examples(args, tokenizer):
-    dataset = TextDataset(tokenizer, data_dir=args.data_dir)
-    return dataset
-
-
-# ------------
-# Train
-# ------------
-
-
-def train(args, train_dataset, model, tokenizer):
-    """ Fine-tune the pretrained model on the corpus. """
-
-    # Prepare the data loading
-    args.train_bach_size = 1
-    train_sampler = RandomSampler(train_dataset)
-    train_dataloader = DataLoader(
-        train_dataset, sampler=train_sampler, batch_size=args.train_bach_size
-    )
-
-    # Prepare the optimizer and schedule (linear warmup and decay)
-    no_decay = ["bias", "LayerNorm.weight"]
-    optimizer_grouped_parameters = [
-        {
-            "params": [
-                p
-                for n, p in model.named_parameters()
-                if not any(nd in n for nd in no_decay)
-            ],
-            "weight_decay": args.weight_decay,
-        },
-        {
-            "params": [
-                p
-                for n, p in model.named_parameters()
-                if any(nd in n for nd in no_decay)
-            ],
-            "weight_decay": 0.0,
-        },
-    ]
-    optimizer = AdamW(
-        optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon
-    )
-    scheduler = WarmupLinearSchedule(
-        optimizer, warmup_steps=args.warmup_steps, t_total=t_total
-    )
-
-    # Train
-    logger.info("***** Running training *****")
-    logger.info("  Num examples = %d", len(train_dataset))
-    logger.info("  Num Epochs = %d", args.num_train_epochs)
-    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
-    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
-        args.train_batch_size
-        * args.gradient_accumulation_steps
-        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
-    )
-    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
-    logger.info("  Total optimization steps = %d", t_total)
-
-    global_step = 0
-    tr_loss, logging_loss = 0.0, 0.0
-    model.zero_grad()
-    train_iterator = trange(args.num_train_epochs, desc="Epoch", disable=True)
-    set_seed(args)
-    for _ in train_iterator:
-        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)
-        for step, batch in enumerate(epoch_iterator):
-            source = ([s for s, _ in batch]).to(args.device)
-            target = ([t for _, t in batch]).to(args.device)
-            model.train()
-            outputs = model(source, target, decoder_lm_labels=mask_padding_tokens(target))
-            loss = outputs[0]
-            loss.backward()
-
-            tr_loss += loss.item()
-            if (step + 1) % args.gradient_accumulation_steps == 0:
-                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
-                optimizer.step()
-                scheduler.step()
-                model.zero_grad()
-                global_step += 1
-
-            if args.max_steps > 0 and global_step > args.max_steps:
-                epoch_iterator.close()
-                break
-
-        if args.max_steps > 0 and global_step > args.max_steps:
-            train_iterator.close()
-            break
-
-    return global_step, tr_loss / global_step
-
-
-def main():
-    parser = argparse.ArgumentParser()
-
-    # Required parameters
-    parser.add_argument(
-        "--data_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The input training data file (a text file).",
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=None,
-        type=str,
-        required=True,
-        help="The output directory where the model predictions and checkpoints will be written.",
-    )
-
-    # Optional parameters
-    parser.add_argument(
-        "--adam_epsilon", default=1e-8, type=float, help="Epsilon for Adam optimizer."
-    )
-    parser.add_argument(
-        "--model_name_or_path",
-        default="bert-base-cased",
-        type=str,
-        help="The model checkpoint to initialize the encoder and decoder's weights with.",
-    )
-    parser.add_argument(
-        "--model_type",
-        default="bert",
-        type=str,
-        help="The decoder architecture to be fine-tuned.",
-    )
-    parser.add_argument(
-        "--learning_rate",
-        default=5e-5,
-        type=float,
-        help="The initial learning rate for Adam.",
-    )
-    parser.add_argument(
-        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
-    )
-    parser.add_argument(
-        "--max_steps",
-        default=-1,
-        type=int,
-        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
-    )
-    parser.add_argument(
-        "--num_train_epochs",
-        default=1,
-        type=int,
-        help="Total number of training epochs to perform.",
-    )
-    parser.add_argument("--seed", default=42, type=int)
-    parser.add_argument(
-        "--warmup_steps", default=0, type=int, help="Linear warmup over warmup_steps."
-    )
-    parser.add_argument(
-        "--weight_decay", default=0.0, type=float, help="Weight deay if we apply some."
-    )
-    args = parser.parse_args()
-
-    if args.model_type != "bert":
-        raise ValueError(
-            "Only the BERT architecture is currently supported for seq2seq."
-        )
-
-    # Set up training device
-    # device = torch.device("cpu")
-
-    # Set seed
-    set_seed(args)
-
-    # Load pretrained model and tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
-    model = Model2Model.from_pretrained(args.model_name_or_path)
-    # model.to(device)
-
-    logger.info("Training/evaluation parameters %s", args)
-
-    # Training
-    train_dataset = load_and_cache_examples(args, tokenizer)
-    global_step, tr_loss = train(args, train_dataset, model, tokenizer)
-    # logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/run_summarization_finetuning.py b/examples/run_summarization_finetuning.py
new file mode 100644
index 0000000000..64bee82c5b
--- /dev/null
+++ b/examples/run_summarization_finetuning.py
@@ -0,0 +1,620 @@
+# coding=utf-8
+# Copyright 2019 The HuggingFace Inc. team.
+# Copyright (c) 2019 The HuggingFace Inc.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning seq2seq models for sequence generation."""
+
+import argparse
+from collections import deque
+import logging
+import os
+import pickle
+import random
+import sys
+
+import numpy as np
+from tqdm import tqdm, trange
+import torch
+from torch.optim import Adam
+from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
+
+from transformers import AutoTokenizer, PreTrainedSeq2seq, Model2Model
+
+logger = logging.getLogger(__name__)
+logging.basicConfig(stream=sys.stdout, level=logging.INFO)
+
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+
+
+# ------------
+# Load dataset
+# ------------
+
+
+class TextDataset(Dataset):
+    """ Abstracts the dataset used to train seq2seq models.
+
+    CNN/Daily News:
+
+    The CNN/Daily News raw datasets are downloaded from [1]. The stories are
+    stored in different files; the summary appears at the end of the story as
+    sentences that are prefixed by the special `@highlight` line. To process
+    the data, untar both datasets in the same folder, and pass the path to this
+    folder as the "data_dir argument. The formatting code was inspired by [2].
+
+    [1] https://cs.nyu.edu/~kcho/
+    [2] https://github.com/abisee/cnn-dailymail/
+    """
+
+    def __init__(self, tokenizer, prefix="train", data_dir="", block_size=512):
+        assert os.path.isdir(data_dir)
+
+        # Load the features that have already been computed, if any
+        cached_features_file = os.path.join(
+            data_dir, "cached_lm_{}_{}".format(block_size, prefix)
+        )
+        if os.path.exists(cached_features_file):
+            logger.info("Loading features from cached file %s", cached_features_file)
+            with open(cached_features_file, "rb") as source:
+                self.examples = pickle.load(source)
+                return
+
+        logger.info("Creating features from dataset at %s", data_dir)
+        datasets = ["cnn", "dailymail"]
+
+        self.examples = {"source": [], "target": []}
+        for dataset in datasets:
+            path_to_stories = os.path.join(data_dir, dataset, "stories")
+            story_filenames_list = os.listdir(path_to_stories)
+            for story_filename in story_filenames_list:
+                path_to_story = os.path.join(path_to_stories, story_filename)
+                if not os.path.isfile(path_to_story):
+                    continue
+
+                with open(path_to_story, encoding="utf-8") as source:
+                    raw_story = source.read()
+                    story_lines, summary_lines = process_story(raw_story)
+                    if len(summary_lines) == 0 or len(story_lines) == 0:
+                        continue
+
+                story_token_ids, summary_token_ids = _encode_for_summarization(
+                    story_lines, summary_lines, tokenizer
+                )
+                story_seq = _fit_to_block_size(story_token_ids, block_size)
+                self.examples["source"].append(story_seq)
+
+                summary_seq = _fit_to_block_size(summary_token_ids, block_size)
+                self.examples["summary"].append(summary_seq)
+
+        logger.info("Saving features into cache file %s", cached_features_file)
+        with open(cached_features_file, "wb") as sink:
+            pickle.dump(self.examples, sink, protocol=pickle.HIGHEST_PROTOCOL)
+
+    def __len__(self):
+        return len(self.examples)
+
+    def __getitem__(self, items):
+        return (
+            torch.tensor(self.examples["source"][items]),
+            torch.tensor(self.examples["target"][items]),
+        )
+
+
+def process_story(raw_story):
+    """ Extract the story and summary from a story file.
+
+    Attributes:
+        raw_story (str): content of the story file as an utf-8 encoded string.
+
+    Raises:
+        IndexError: If the stoy is empty or contains no highlights.
+    """
+    nonempty_lines = list(
+        filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
+    )
+
+    # for some unknown reason some lines miss a period, add it
+    nonempty_lines = [_add_missing_period(line) for line in nonempty_lines]
+
+    # gather article lines
+    story_lines = []
+    lines = deque(nonempty_lines)
+    while True:
+        try:
+            element = lines.popleft()
+            if element.startswith("@highlight"):
+                break
+            story_lines.append(element)
+        except IndexError:
+            # if "@highlight" is absent from the file we pop
+            # all elements until there is None.
+            return story_lines, []
+
+    # gather summary lines
+    summary_lines = list(filter(lambda t: not t.startswith("@highlight"), lines))
+
+    return story_lines, summary_lines
+
+
+def _encode_for_summarization(story_lines, summary_lines, tokenizer):
+    """ Encode the story and summary lines, and join them
+    as specified in [1] by using `[SEP] [CLS]` tokens to separate
+    sentences.
+    """
+    story_lines_token_ids = [
+        tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
+        for line in story_lines
+    ]
+    summary_lines_token_ids = [
+        tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
+        for line in summary_lines
+    ]
+
+    story_token_ids = [
+        token for sentence in story_lines_token_ids for token in sentence
+    ]
+    summary_token_ids = [
+        token for sentence in summary_lines_token_ids for token in sentence
+    ]
+
+    return story_token_ids, summary_token_ids
+
+
+def _add_missing_period(line):
+    END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"]
+    if line.startswith("@highlight"):
+        return line
+    if line[-1] in END_TOKENS:
+        return line
+    return line + "."
+
+
+def _fit_to_block_size(sequence, block_size):
+    """ Adapt the source and target sequences' lengths to the block size.
+    If the sequence is shorter than the block size we pad it with -1 ids
+    which correspond to padding tokens.
+    """
+    if len(sequence) > block_size:
+        return sequence[:block_size]
+    else:
+        sequence.extend([0] * (block_size - len(sequence)))
+        return sequence
+
+
+def mask_padding_tokens(sequence):
+    """ Padding token, encoded as 0, are represented by the value -1 in the
+    masks """
+    padded = sequence.clone()
+    padded[padded == 0] = -1
+    return padded
+
+
+def load_and_cache_examples(args, tokenizer):
+    dataset = TextDataset(tokenizer, data_dir=args.data_dir)
+    return dataset
+
+
+def compute_token_type_ids(batch, separator_token_id):
+    """ Segment embeddings as described in [1]
+
+    The values {0,1} were found in the repository [2].
+
+    Attributes:
+        batch: torch.Tensor, size [batch_size, block_size]
+            Batch of input.
+        separator_token_id: int
+            The value of the token that separates the segments.
+
+    [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
+        arXiv preprint arXiv:1908.08345 (2019).
+    [2] https://github.com/nlpyang/PreSumm (/src/prepro/data_builder.py, commit fac1217)
+    """
+    batch_embeddings = []
+    sentence_num = 0
+    for sequence in batch:
+        embeddings = []
+        for s in sequence:
+            if s == separator_token_id:
+                sentence_num += 1
+            embeddings.append(sentence_num % 2)
+        batch_embeddings.append(embeddings)
+    return torch.tensor(batch_embeddings)
+
+
+# ----------
+# Optimizers
+# ----------
+
+
+class BertSumOptimizer(object):
+    """ Specific optimizer for BertSum.
+
+    As described in [1], the authors fine-tune BertSum for abstractive
+    summarization using two Adam Optimizers with different warm-up steps and
+    learning rate. They also use a custom learning rate scheduler.
+
+    [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
+        arXiv preprint arXiv:1908.08345 (2019).
+    """
+
+    def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-9):
+        self.encoder = model.encoder
+        self.decoder = model.decoder
+        self.lr = lr
+        self.warmup_steps = warmup_steps
+
+        self.optimizers = {
+            "encoder": Adam(
+                model.encoder.parameters(),
+                lr=lr["encoder"],
+                betas=(beta_1, beta_2),
+                eps=eps,
+            ),
+            "decoder": Adam(
+                model.decoder.parameters(),
+                lr=lr["decoder"],
+                betas=(beta_1, beta_2),
+                eps=eps,
+            ),
+        }
+
+        self._step = 0
+
+    def _update_rate(self, stack):
+        return self.lr[stack] * min(
+            self._step ** (-0.5), self._step * self.warmup_steps[stack] ** (-0.5)
+        )
+
+    def zero_grad(self):
+        self.optimizer_decoder.zero_grad()
+        self.optimizer_encoder.zero_grad()
+
+    def step(self):
+        self._step += 1
+        for stack, optimizer in self.optimizers.items():
+            new_rate = self._update_rate(stack)
+            for param_group in optimizer.param_groups:
+                param_group["lr"] = new_rate
+            optimizer.step()
+
+
+# ------------
+# Train
+# ------------
+
+
+def train(args, model, tokenizer):
+    """ Fine-tune the pretrained model on the corpus. """
+    set_seed(args)
+
+    # Load the data
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_dataset = load_and_cache_examples(args, tokenizer)
+    train_sampler = RandomSampler(train_dataset)
+    train_dataloader = DataLoader(
+        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size
+    )
+
+    # Training schedule
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = t_total // (
+            len(train_dataloader) // args.gradient_accumulation_steps + 1
+        )
+    else:
+        t_total = (
+            len(train_dataloader)
+            // args.gradient_accumulation_steps
+            * args.num_train_epochs
+        )
+
+    # Prepare the optimizer
+    lr = {"encoder": 0.002, "decoder": 0.2}
+    warmup_steps = {"encoder": 20000, "decoder": 10000}
+    optimizer = BertSumOptimizer(model, lr, warmup_steps)
+
+    # Train
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info(
+        "  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size
+    )
+    logger.info(
+        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
+        args.train_batch_size * args.gradient_accumulation_steps
+        # * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
+    )
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    model.zero_grad()
+    train_iterator = trange(args.num_train_epochs, desc="Epoch", disable=True)
+
+    global_step = 0
+    tr_loss = 0.0
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)
+        for step, batch in enumerate(epoch_iterator):
+            source, target = batch
+            token_type_ids = compute_token_type_ids(source, tokenizer.cls_token_id)
+            labels_src = mask_padding_tokens(source)
+            labels_tgt = mask_padding_tokens(target)
+
+            source = source.to(args.device)
+            target = target.to(args.device)
+            token_type_ids = token_type_ids.to(args.device)
+            labels_src = labels_src.to(args.device)
+            labels_tgt = labels_tgt.to(args.device)
+
+            model.train()
+            outputs = model(
+                source,
+                target,
+                token_type_ids=token_type_ids,
+                decoder_encoder_attention_mask=labels_src,
+                decoder_attention_mask=labels_tgt,
+                decoder_lm_labels=labels_tgt,
+                decoder_initialize_randomly=True,
+            )
+
+            loss = outputs[0]
+            print(loss)
+            if args.gradient_accumulation_steps > 1:
+                loss /= args.gradient_accumulation_steps
+
+            loss.backward()
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+                optimizer.step()
+                model.zero_grad()
+                global_step += 1
+
+            if args.max_steps > 0 and global_step > args.max_steps:
+                epoch_iterator.close()
+                break
+
+        if args.max_steps > 0 and global_step > args.max_steps:
+            train_iterator.close()
+            break
+
+    return global_step, tr_loss / global_step
+
+
+# ------------
+# Evaluate
+# ------------
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    """Evaluate `model` and return (and save in `eval_results.txt`) its perplexity as a dict."""
+    set_seed(args)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    eval_dataset = load_and_cache_examples(args, tokenizer, evaluate=True)
+    eval_sampler = SequentialSampler(eval_dataset)
+    eval_dataloader = DataLoader(
+        eval_dataset, sampler=eval_sampler, batch_size=args.eval_batch_size
+    )
+
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = %d", len(eval_dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    eval_loss, nb_eval_steps = 0.0, 0
+    model.eval()
+
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        source, target = batch
+        labels_src = mask_padding_tokens(source)
+        labels_tgt = mask_padding_tokens(target)
+        # `Tensor.to` is not in-place: keep the tensors it returns.
+        source = source.to(args.device)
+        target = target.to(args.device)
+        labels_src = labels_src.to(args.device)
+        labels_tgt = labels_tgt.to(args.device)
+
+        with torch.no_grad():
+            outputs = model(
+                source,
+                target,
+                decoder_encoder_attention_mask=labels_src,
+                decoder_attention_mask=labels_tgt,
+                decoder_lm_labels=labels_tgt,
+            )
+            lm_loss = outputs[0]
+            eval_loss += lm_loss.mean().item()
+        nb_eval_steps += 1
+
+    eval_loss = eval_loss / nb_eval_steps
+    perplexity = torch.exp(torch.tensor(eval_loss))
+    result = {"perplexity": perplexity}
+
+    # Save the evaluation's results
+    output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
+    if not os.path.exists(args.output_dir):
+        os.makedirs(args.output_dir)
+
+    with open(output_eval_file, "w") as writer:
+        logger.info("***** Eval results {} *****".format(prefix))
+        for key in sorted(result.keys()):
+            logger.info("  %s = %s", key, str(result[key]))
+            writer.write("%s = %s\n" % (key, str(result[key])))
+
+    return result
+
+
+def main():
+    """Parse the command-line arguments, then fine-tune and save the model when `--do_train` is set."""
+
+    def str_to_bool(value):
+        # argparse's `type=bool` maps every non-empty string, "False" included, to True.
+        return value.lower() not in ("", "0", "false", "f", "no", "n")
+
+    parser = argparse.ArgumentParser()
+
+    # Required parameters
+    parser.add_argument(
+        "--data_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The input training data file (a text file).",
+    )
+    parser.add_argument(
+        "--output_dir",
+        default=None,
+        type=str,
+        required=True,
+        help="The output directory where the model predictions and checkpoints will be written.",
+    )
+
+    # Optional parameters
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        type=int,
+        default=1,
+        help="Number of updates steps to accumulate before performing a backward/update pass.",
+    )
+    parser.add_argument(
+        "--do_evaluate",
+        type=str_to_bool,
+        default=False,
+        help="Run model evaluation on out-of-sample data.",
+    )
+    parser.add_argument("--do_train", type=str_to_bool, default=False, help="Run training.")
+    parser.add_argument(
+        "--do_overwrite_output_dir",
+        type=str_to_bool,
+        default=False,
+        help="Whether to overwrite the output dir.",
+    )
+    parser.add_argument(
+        "--model_name_or_path",
+        default="bert-base-cased",
+        type=str,
+        help="The model checkpoint to initialize the encoder and decoder's weights with.",
+    )
+    parser.add_argument(
+        "--model_type",
+        default="bert",
+        type=str,
+        help="The decoder architecture to be fine-tuned.",
+    )
+    parser.add_argument(
+        "--max_grad_norm", default=1.0, type=float, help="Max gradient norm."
+    )
+    parser.add_argument(
+        "--max_steps",
+        default=-1,
+        type=int,
+        help="If > 0: set total number of training steps to perform. Override num_train_epochs.",
+    )
+    parser.add_argument(
+        "--to_cpu", default=False, type=str_to_bool, help="Whether to force training on CPU."
+    )
+    parser.add_argument(
+        "--num_train_epochs",
+        default=1,
+        type=int,
+        help="Total number of training epochs to perform.",
+    )
+    parser.add_argument(
+        "--per_gpu_train_batch_size",
+        default=4,
+        type=int,
+        help="Batch size per GPU/CPU for training.",
+    )
+    parser.add_argument("--seed", default=42, type=int)
+    args = parser.parse_args()
+
+    if (
+        os.path.exists(args.output_dir)
+        and os.listdir(args.output_dir)
+        and args.do_train
+        and not args.do_overwrite_output_dir
+    ):
+        raise ValueError(
+            "Output directory ({}) already exists and is not empty. Use --do_overwrite_output_dir to overwrite.".format(
+                args.output_dir
+            )
+        )
+
+    # Set up training device
+    if args.to_cpu or not torch.cuda.is_available():
+        args.device = torch.device("cpu")
+        args.n_gpu = 0
+    else:
+        args.device = torch.device("cuda")
+        args.n_gpu = torch.cuda.device_count()
+
+    # Load pretrained model and tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+    model = Model2Model.from_pretrained(args.model_name_or_path)
+
+    # Setup logging
+    logging.basicConfig(
+        format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
+        datefmt="%m/%d/%Y %H:%M:%S",
+        level=logging.INFO,
+    )
+    logger.warning(
+        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+        0,
+        args.device,
+        args.n_gpu,
+        False,
+        False,
+    )
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Train the model
+    model.to(args.device)
+    if args.do_train:
+        global_step, tr_loss = train(args, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+        if not os.path.exists(args.output_dir):
+            os.makedirs(args.output_dir)
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+
+        # Save the trained model, configuration and tokenizer using `save_pretrained()` so that they
+        # can be reloaded using `from_pretrained()`; unwrap `.module` for distributed/parallel training.
+        model_to_save = model.module if hasattr(model, "module") else model
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+        torch.save(args, os.path.join(args.output_dir, "training_arguments.bin"))
+
+    # Evaluate the model
+    results = {}
+    if args.do_evaluate:
+        checkpoints = []
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+        for checkpoint in checkpoints:
+            encoder_checkpoint = os.path.join(checkpoint, "encoder")
+            decoder_checkpoint = os.path.join(checkpoint, "decoder")
+            model = PreTrainedSeq2seq.from_pretrained(encoder_checkpoint, decoder_checkpoint)
+            model.to(args.device)
+            results = "placeholder"
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/run_seq2seq_finetuning_test.py b/examples/run_summarization_finetuning_test.py
similarity index 79%
rename from examples/run_seq2seq_finetuning_test.py
rename to examples/run_summarization_finetuning_test.py
index 77dc58666c..fd997ee0c2 100644
--- a/examples/run_seq2seq_finetuning_test.py
+++ b/examples/run_summarization_finetuning_test.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 import unittest
 
-from run_seq2seq_finetuning import _fit_to_block_size, process_story
+from run_summarization_finetuning import _fit_to_block_size, process_story
 
 
 class DataLoaderTest(unittest.TestCase):
@@ -43,15 +43,16 @@ class DataLoaderTest(unittest.TestCase):
         raw_story = """It was the year of Our Lord one thousand seven hundred and
         seventy-five.\n\nSpiritual revelations were conceded to England at that
         favoured period, as at this."""
-        with self.assertRaises(IndexError):
-            process_story(raw_story)
+        _, summary = process_story(raw_story)
+        self.assertEqual(summary, [])
 
     def test_process_empty_story(self):
         """ An empty story should also raise and exception.
         """
         raw_story = ""
-        with self.assertRaises(IndexError):
-            process_story(raw_story)
+        story, summary = process_story(raw_story)
+        self.assertEqual(story, [])
+        self.assertEqual(summary, [])
 
     def test_story_with_missing_period(self):
         raw_story = (
@@ -59,17 +60,16 @@ class DataLoaderTest(unittest.TestCase):
             "seventy-five\n\nSpiritual revelations were conceded to England "
             "at that favoured period, as at this.\n@highlight\n\nIt was the best of times"
         )
-        story, summary = process_story(raw_story)
+        story_lines, summary_lines = process_story(raw_story)
 
-        expected_story = (
-            "It was the year of Our Lord one thousand seven hundred and "
-            "seventy-five. Spiritual revelations were conceded to England at that "
-            "favoured period, as at this."
-        )
-        self.assertEqual(expected_story, story)
+        expected_story_lines = [
+            "It was the year of Our Lord one thousand seven hundred and seventy-five.",
+            "Spiritual revelations were conceded to England at that favoured period, as at this.",
+        ]
+        self.assertEqual(expected_story_lines, story_lines)
 
-        expected_summary = "It was the best of times."
-        self.assertEqual(expected_summary, summary)
+        expected_summary_lines = ["It was the best of times."]
+        self.assertEqual(expected_summary_lines, summary_lines)
 
 
 if __name__ == "__main__":
diff --git a/transformers/__init__.py b/transformers/__init__.py
index ee8e812a23..2206a0302e 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -87,7 +87,7 @@ if is_torch_available():
     from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
                                 DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
                                 DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_seq2seq import Model2Model
+    from .modeling_seq2seq import PreTrainedSeq2seq, Model2Model
 
     # Optimization
     from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
diff --git a/transformers/modeling_beam_search.py b/transformers/modeling_beam_search.py
new file mode 100644
index 0000000000..3a27625f90
--- /dev/null
+++ b/transformers/modeling_beam_search.py
@@ -0,0 +1,240 @@
+# coding=utf-8
+# Copyright (c) 2019 Yang Liu
+
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+A general wrapper around models with LM heads to generate sequences
+using beam search.
+"""
+import torch
+from torch import nn
+
+
+class ModelWithBeamSearch(nn.Module):
+    """Wrap an encoder-decoder model to generate sequences with beam search."""
+
+    def __init__(
+        self,
+        model,
+        beam_size,
+        start_token_id,
+        end_token_id,
+        pad_token_id,
+        min_length,
+        max_length,
+        alpha,
+        block_trigram=True,
+    ):
+        """
+        Attributes:
+            model: encoder-decoder model whose `encoder` and `decoder` are called.
+            beam_size: number of beams kept for each element of the batch.
+            start_token_id, end_token_id, pad_token_id: ids of the special tokens.
+            min_length, max_length: bounds on the number of decoding steps.
+            alpha: exponent of the length penalty applied to the beam scores.
+            block_trigram: whether to make beams that repeat a trigram unlikely.
+        """
+        super(ModelWithBeamSearch, self).__init__()
+        self.model = model
+        self.beam_size = beam_size
+        self.start_token_id = start_token_id
+        self.end_token_id = end_token_id
+        self.pad_token_id = pad_token_id
+        self.min_length = min_length
+        self.max_length = max_length
+        self.alpha = alpha
+        self.block_trigram = block_trigram
+
+    def forward(self, input_ids, **kwargs):
+        """Run beam search on `input_ids`; `decoder_`-prefixed kwargs go to the decoder.
+
+        Returns a dict with, for each batch element, the best finished hypothesis
+        ("predictions") and its length-penalized score ("scores").
+        """
+        # Separate the encoder- and decoder- specific kwargs. A kwarg is
+        # decoder-specific if the key starts with `decoder_`
+        kwargs_encoder = {
+            argument: value
+            for argument, value in kwargs.items()
+            if not argument.startswith("decoder_")
+        }
+        kwargs_decoder = {
+            argument[len("decoder_"):]: value
+            for argument, value in kwargs.items()
+            if argument.startswith("decoder_")
+        }
+
+        batch_size = input_ids.size(0)
+        device = input_ids.device  # the search state is created where the inputs live
+
+        # Variables that keep track of the status of the search
+        hypotheses = [[] for _ in range(batch_size)]
+        batch_offset = torch.arange(batch_size, dtype=torch.long, device=device)
+        beam_offset = torch.arange(
+            0,
+            batch_size * self.beam_size,
+            step=self.beam_size,
+            dtype=torch.long,
+            device=device,
+        )
+        growing_beam = torch.full(
+            (batch_size * self.beam_size, 1),
+            self.start_token_id,
+            dtype=torch.long,
+            device=device,
+        )
+        topk_log_probabilities = torch.tensor(
+            [0.0] + [float("-inf")] * (self.beam_size - 1),
+            dtype=torch.float,
+            device=device,
+        ).repeat(batch_size)
+
+        # Forward pass on the encoder. NOTE(review): assumes the first element of the
+        # encoder outputs is the sequence of hidden states -- confirm for the wrapped model.
+        encoder_outputs = self.model.encoder(input_ids, **kwargs_encoder)
+        kwargs_decoder["encoder_hidden_states"] = tile(
+            encoder_outputs[0], self.beam_size, dim=0
+        )
+
+        results = {}
+        results["predictions"] = [[] for _ in range(batch_size)]
+        results["scores"] = [[] for _ in range(batch_size)]
+
+        for step in range(self.max_length):
+            # No cached states: decode the whole beams again and keep the scores of
+            # the last position. NOTE(review): assumes `outputs[0]` holds the LM scores.
+            outputs = self.model.decoder(growing_beam, **kwargs_decoder)
+            log_probabilities = torch.nn.functional.log_softmax(outputs[0][:, -1, :], dim=-1)
+            vocab_size = log_probabilities.size(-1)
+            # The batch size changes as some beams finish so we define:
+            _B = log_probabilities.size(0) // self.beam_size
+
+            # Multiply each beam probability with the probability of the
+            # next token (conditioned on the words in the beam).
+            log_probabilities += topk_log_probabilities.view(-1, 1)
+
+            # if the beam has not attained the minimum required length we
+            # make the end token arbitrarily unlikely.
+            if step < self.min_length:
+                log_probabilities[:, self.end_token_id] = -1e20
+
+            # Remove repeating tri-grams
+            if self.block_trigram and step + 1 > 3:
+                for i in range(_B * self.beam_size):
+                    tokens = growing_beam[i].tolist()
+                    trigrams = [tuple(tokens[j : j + 3]) for j in range(len(tokens) - 2)]
+                    if trigrams[-1] in trigrams[:-1]:
+                        log_probabilities[i] = -1e20
+
+            # Find the `beam_size` (previous_beam + token) combinations with
+            # the highest score
+            topk_log_probabilities, topk_ids = log_probabilities.view(
+                _B, self.beam_size * vocab_size
+            ).topk(self.beam_size, dim=1)
+
+            # Apply the length penalty. The +1 accounts for the [EOS] token
+            # that will be added if the beam ends.
+            length_penalty = ((5.0 + (step + 1)) / 6.0) ** self.alpha
+            topk_scores = topk_log_probabilities / length_penalty
+
+            # Retrieve the corresponding respective beam and token id
+            # topk_token_ids[i] will be added to topk_beam_ids[i]
+            topk_beam_ids = topk_ids // vocab_size
+            topk_token_ids = topk_ids.fmod(vocab_size)
+
+            # Retrieve the row index of the surviving beams in the original
+            # view of the log_probabilities tensor (one row per batch element)
+            surviving_beams = topk_beam_ids + beam_offset[:_B].view(-1, 1)
+
+            # Append the last predictions
+            selected_beams = growing_beam.index_select(0, surviving_beams.view(-1))
+            growing_beam = torch.cat([selected_beams, topk_token_ids.view(-1, 1)], 1)
+
+            # Check if any of the beam searches has ended during this
+            # growth step. Also if top beam (most probable) has ended
+            # for one element of the batch.
+            is_finished = topk_token_ids.eq(self.end_token_id)
+            if step + 1 == self.max_length:
+                is_finished.fill_(1)
+            is_top_beam_finished = is_finished[:, 0].eq(1)
+
+            # Save the finished searches
+            if is_finished.any():
+                predictions = growing_beam.view(-1, self.beam_size, growing_beam.size(1))
+                for i in range(is_finished.size(0)):
+                    if is_top_beam_finished[i]:
+                        is_finished[i].fill_(1)
+                    finished_hyp = is_finished[i].nonzero().view(-1)
+                    # Store finished hypotheses for this batch.
+                    b = batch_offset[i].item()
+                    for j in finished_hyp:
+                        hypotheses[b].append((topk_scores[i, j], predictions[i, j, :]))
+                    # If the batch reached the end, save the best hypotheses
+                    # in terms of length-penalized score.
+                    if is_top_beam_finished[i]:
+                        best_score, best_prediction = max(hypotheses[b], key=lambda hyp: hyp[0])
+                        results["scores"][b].append(best_score)
+                        results["predictions"][b].append(best_prediction)
+
+                non_finished = is_top_beam_finished.eq(0).nonzero().view(-1)
+                if len(non_finished) == 0:
+                    break
+                # Remove finished batches for the next step.
+                topk_log_probabilities = topk_log_probabilities.index_select(0, non_finished)
+                batch_offset = batch_offset.index_select(0, non_finished)
+                surviving_beams = surviving_beams.index_select(0, non_finished)
+                growing_beam = predictions.index_select(0, non_finished).view(
+                    -1, growing_beam.size(-1)
+                )
+
+            # Re-order the state for the next pass
+            kwargs_decoder["encoder_hidden_states"] = kwargs_decoder[
+                "encoder_hidden_states"
+            ].index_select(0, surviving_beams.view(-1))
+
+        return results
+
+
+def tile(x, count, dim=0):
+    """
+    Repeat each slice of `x` along dimension `dim` `count` times in a row.
+
+    Example:
+        >> ex = torch.tensor([[1, 2], [3, 4]])
+        >> tile(ex, 2, 0)
+        tensor([[1, 2], [1, 2], [3, 4], [3, 4]])
+    """
+    move_dim = dim != 0
+    if move_dim:
+        # Swapping two axes is its own inverse: the same permutation brings `dim`
+        # to the front here and puts it back in place at the end.
+        axes = list(range(x.dim()))
+        axes[0], axes[dim] = axes[dim], axes[0]
+        x = x.permute(axes).contiguous()
+
+    num_slices = x.size(0)
+    tiled_size = [num_slices * count] + list(x.size())[1:]
+    # (slices, rest) -> (rest, slices) -> (count * rest, slices) -> (slices, count * rest)
+    flat = x.view(num_slices, -1).transpose(0, 1)
+    flat = flat.repeat(count, 1).transpose(0, 1)
+    x = flat.contiguous().view(*tiled_size)
+
+    if move_dim:
+        x = x.permute(axes).contiguous()
+    return x
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index d10f32c1fa..93f3c7e1f1 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -646,7 +646,7 @@ class BertModel(BertPreTrainedModel):
         if attention_mask.dim() == 2:
             if self.config.is_decoder:
                 batch_size, seq_length = input_ids.size()
-                seq_ids = torch.arange(seq_length)
+                seq_ids = torch.arange(seq_length, device=input_ids.device)
                 causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
             else:
@@ -660,6 +660,13 @@ class BertModel(BertPreTrainedModel):
         extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
 
+        # If a 2D encoder attention mask is provided for the cross-attention
+        # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if encoder_attention_mask is not None:
+            encoder_attention_mask = encoder_attention_mask[:, None, None, :]
+            encoder_attention_mask = encoder_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            encoder_attention_mask = (1.0 - encoder_attention_mask) * -10000.0
+
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N
@@ -819,7 +826,7 @@ class BertForMaskedLM(BertPreTrainedModel):
                                    self.bert.embeddings.word_embeddings)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
-                masked_lm_labels=None, lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None):
+                masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ):
 
         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
@@ -838,11 +845,8 @@ class BertForMaskedLM(BertPreTrainedModel):
         # 1. If a tensor that contains the indices of masked labels is provided,
         #    the cross-entropy is the MLM cross-entropy that measures the likelihood
         #    of predictions for masked words.
-        # 2. If encoder hidden states are provided we are in a causal situation where we
+        # 2. If `lm_labels` is provided we are in a causal scenario where we
         #    try to predict the next word for each input in the encoder.
-        if masked_lm_labels is not None and lm_labels is not None:
-            raise AttributeError("Masked LM training with an encoder-decoder is not supported.")
-
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)  # -1 index = padding token
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
@@ -851,9 +855,9 @@ class BertForMaskedLM(BertPreTrainedModel):
         if lm_labels is not None:
             # we are doing next-token prediction; shift prediction scores and input ids by one
             prediction_scores = prediction_scores[:, :-1, :]
-            lm_labels = lm_labels[:, 1:, :]
+            lm_labels = lm_labels[:, 1:]
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            seq2seq_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1))
+            seq2seq_loss = loss_fct(prediction_scores.reshape(-1, self.config.vocab_size), lm_labels.reshape(-1))
             outputs = (seq2seq_loss,) + outputs
 
         return outputs  # (mlm_or_seq2seq_loss), prediction_scores, (hidden_states), (attentions)
diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index 108fdaa853..2767dd2cd1 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -17,13 +17,12 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import logging
+import os
 
 import torch
 from torch import nn
 
-from .file_utils import add_start_docstrings
 from .modeling_auto import AutoModel, AutoModelWithLMHead
-from .modeling_utils import PreTrainedModel, SequenceSummary
 
 logger = logging.getLogger(__name__)
 
@@ -43,7 +42,13 @@ class PreTrainedSeq2seq(nn.Module):
         self.decoder = decoder
 
     @classmethod
-    def from_pretrained(cls, encoder_pretrained_model_name_or_path=None, decoder_pretrained_model_name_or_path=None, *model_args, **kwargs):
+    def from_pretrained(
+        cls,
+        encoder_pretrained_model_name_or_path=None,
+        decoder_pretrained_model_name_or_path=None,
+        *model_args,
+        **kwargs
+    ):
         r""" Instantiates an encoder and a decoder from one or two base classes
         of the library from pre-trained model checkpoints.
 
@@ -108,23 +113,28 @@ class PreTrainedSeq2seq(nn.Module):
 
         # Separate the encoder- and decoder- specific kwargs. A kwarg is
         # decoder-specific it the key starts with `decoder_`
-        kwargs_decoder = {}
-        kwargs_encoder = kwargs
-        for key in kwargs_encoder.keys():
-            if key.startswith("decoder_"):
-                kwargs_decoder[key.replace("decoder_", "")] = kwargs_encoder.pop(key)
+        kwargs_encoder = {
+            argument: value
+            for argument, value in kwargs.items()
+            if not argument.startswith("decoder_")
+        }
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value
+            for argument, value in kwargs.items()
+            if argument.startswith("decoder_")
+        }
 
         # Load and initialize the encoder and decoder
-        #  The distinction between encoder and decoder at the model level is made
-        #  by the value of the flag `is_decoder` that we need to set correctly.
-        encoder = kwargs.pop("encoder_model", None)
+        # The distinction between encoder and decoder at the model level is made
+        # by the value of the flag `is_decoder` that we need to set correctly.
+        encoder = kwargs_encoder.pop("encoder_model", None)
         if encoder is None:
             kwargs_encoder["is_decoder"] = False
             encoder = AutoModel.from_pretrained(
                 encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder
             )
 
-        decoder = kwargs.pop("decoder_model", None)
+        decoder = kwargs_decoder.pop("model", None)
         if decoder is None:
             kwargs_decoder["is_decoder"] = True
             decoder = AutoModelWithLMHead.from_pretrained(
@@ -135,6 +145,12 @@ class PreTrainedSeq2seq(nn.Module):
 
         return model
 
+    def save_pretrained(self, save_directory):
+        """ Save the encoder in `save_directory/encoder` and the decoder in `save_directory/decoder`;
+        pass these two folders to :func:`~transformers.PreTrainedSeq2seq.from_pretrained` to reload. """
+        self.encoder.save_pretrained(os.path.join(save_directory, "encoder"))
+        self.decoder.save_pretrained(os.path.join(save_directory, "decoder"))
+
     def forward(self, encoder_input_ids, decoder_input_ids, **kwargs):
         """ The forward pass on a seq2eq depends what we are performing:
 
@@ -155,22 +171,29 @@ class PreTrainedSeq2seq(nn.Module):
         """
         # Separate the encoder- and decoder- specific kwargs. A kwarg is
         # decoder-specific it the key starts with `decoder_`
-        kwargs_decoder = {}
-        kwargs_encoder = kwargs
-        for key in kwargs_encoder.keys():
-            if key.startswith("decoder_"):
-                kwargs_decoder[key.replace("decoder_", "")] = kwargs_encoder.pop(key)
+        kwargs_encoder = {
+            argument: value
+            for argument, value in kwargs.items()
+            if not argument.startswith("decoder_")
+        }
+        kwargs_decoder = {
+            argument[len("decoder_") :]: value
+            for argument, value in kwargs.items()
+            if argument.startswith("decoder_")
+        }
 
         # Encode if needed (training, first prediction pass)
         encoder_hidden_states = kwargs_encoder.pop("encoder_hidden_states", None)
         if encoder_hidden_states is None:
             encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[0][-1]  # output of the encoder *stack*
+            encoder_hidden_states = encoder_outputs[0][
+                -1
+            ]  # output of the encoder *stack*
         else:
             encoder_outputs = ()
 
         # Decode
-        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states[None, :, :]
         decoder_outputs = self.decoder(decoder_input_ids, **kwargs_decoder)
 
         return decoder_outputs + encoder_outputs
@@ -201,9 +224,25 @@ class Model2Model(PreTrainedSeq2seq):
 
     @classmethod
     def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
-        model = super(Model2Model, cls).from_pretrained(encoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
-                                                        decoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
-                                                        **kwargs)
+
+        if (
+            "bert" not in pretrained_model_name_or_path
+            or "roberta" in pretrained_model_name_or_path
+            or "distilbert" in pretrained_model_name_or_path
+        ):
+            raise ValueError("Only the Bert model is currently supported.")
+
+        model = super(Model2Model, cls).from_pretrained(
+            encoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
+            decoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
+            **kwargs
+        )
+
+        # Some architectures require for the decoder to be initialized randomly
+        # before fine-tuning.
+        if kwargs.get("decoder_initialize_randomly", False):
+            model.decoder.init_weights()
+
         return model
 
 

From 438f2730a03e19bc21f2823c659ceaed0dfe8ef7 Mon Sep 17 00:00:00 2001
From: altsoph <altsoph@gmail.com>
Date: Fri, 25 Oct 2019 13:22:58 +0300
Subject: [PATCH 123/144] Evaluation code fixed.

---
 examples/run_lm_finetuning.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 571bcb4391..4d32385e40 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -86,6 +86,7 @@ class TextDataset(Dataset):
             # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
             # If your dataset is small, first you should loook for a bigger one :-) and second you
             # can change this behavior by adding (model specific) padding.
+            self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[-block_size:])) # DIRTY!
 
             logger.info("Saving features into cached file %s", cached_features_file)
             with open(cached_features_file, 'wb') as handle:
@@ -309,10 +310,12 @@ def evaluate(args, model, tokenizer, prefix=""):
     model.eval()
 
     for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        batch = batch.to(args.device)
+        inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
+        inputs = inputs.to(args.device)
+        labels = labels.to(args.device)
 
         with torch.no_grad():
-            outputs = model(batch, masked_lm_labels=batch) if args.mlm else model(batch, labels=batch)
+            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
             lm_loss = outputs[0]
             eval_loss += lm_loss.mean().item()
         nb_eval_steps += 1
@@ -540,4 +543,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
+    main()
\ No newline at end of file

From 079bfb32fba4f2b39d344ca7af88d79a3ff27c7c Mon Sep 17 00:00:00 2001
From: altsoph <altsoph@gmail.com>
Date: Fri, 25 Oct 2019 13:26:37 +0300
Subject: [PATCH 124/144] Evaluation fixed.

---
 examples/run_lm_finetuning.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/examples/run_lm_finetuning.py b/examples/run_lm_finetuning.py
index 4d32385e40..982d8aa1b7 100644
--- a/examples/run_lm_finetuning.py
+++ b/examples/run_lm_finetuning.py
@@ -86,7 +86,6 @@ class TextDataset(Dataset):
             # Note that we are loosing the last truncated example here for the sake of simplicity (no padding)
             # If your dataset is small, first you should loook for a bigger one :-) and second you
             # can change this behavior by adding (model specific) padding.
-            self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[-block_size:])) # DIRTY!
 
             logger.info("Saving features into cached file %s", cached_features_file)
             with open(cached_features_file, 'wb') as handle:
@@ -543,4 +542,4 @@ def main():
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()

From dfce40969141eb037e8af3ed64e490a876386bf5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Tue, 29 Oct 2019 17:10:20 +0100
Subject: [PATCH 125/144] resolve PR comments

---
 examples/run_summarization_finetuning.py      | 292 +++++-----------
 examples/run_summarization_finetuning_test.py |  76 ----
 examples/utils_summarization.py               | 184 ++++++++++
 examples/utils_summarization_test.py          | 133 +++++++
 transformers/modeling_beam_search.py          | 325 ++++++++++--------
 transformers/modeling_bert.py                 |  31 +-
 transformers/modeling_seq2seq.py              |  79 +++--
 7 files changed, 647 insertions(+), 473 deletions(-)
 delete mode 100644 examples/run_summarization_finetuning_test.py
 create mode 100644 examples/utils_summarization.py
 create mode 100644 examples/utils_summarization_test.py

diff --git a/examples/run_summarization_finetuning.py b/examples/run_summarization_finetuning.py
index 64bee82c5b..1888f56caf 100644
--- a/examples/run_summarization_finetuning.py
+++ b/examples/run_summarization_finetuning.py
@@ -16,10 +16,9 @@
 """ Finetuning seq2seq models for sequence generation."""
 
 import argparse
-from collections import deque
+import functools
 import logging
 import os
-import pickle
 import random
 import sys
 
@@ -29,7 +28,22 @@ import torch
 from torch.optim import Adam
 from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
 
-from transformers import AutoTokenizer, PreTrainedSeq2seq, Model2Model
+from transformers import (
+    AutoTokenizer,
+    BertForMaskedLM,
+    BertConfig,
+    PreTrainedSeq2seq,
+    Model2Model,
+)
+
+from utils_summarization import (
+    CNNDailyMailDataset,
+    encode_for_summarization,
+    fit_to_block_size,
+    build_lm_labels,
+    build_mask,
+    compute_token_type_ids,
+)
 
 logger = logging.getLogger(__name__)
 logging.basicConfig(stream=sys.stdout, level=logging.INFO)
@@ -46,194 +60,41 @@ def set_seed(args):
 # ------------
 
 
-class TextDataset(Dataset):
-    """ Abstracts the dataset used to train seq2seq models.
-
-    CNN/Daily News:
-
-    The CNN/Daily News raw datasets are downloaded from [1]. The stories are
-    stored in different files; the summary appears at the end of the story as
-    sentences that are prefixed by the special `@highlight` line. To process
-    the data, untar both datasets in the same folder, and pass the path to this
-    folder as the "data_dir argument. The formatting code was inspired by [2].
-
-    [1] https://cs.nyu.edu/~kcho/
-    [2] https://github.com/abisee/cnn-dailymail/
-    """
-
-    def __init__(self, tokenizer, prefix="train", data_dir="", block_size=512):
-        assert os.path.isdir(data_dir)
-
-        # Load the features that have already been computed, if any
-        cached_features_file = os.path.join(
-            data_dir, "cached_lm_{}_{}".format(block_size, prefix)
-        )
-        if os.path.exists(cached_features_file):
-            logger.info("Loading features from cached file %s", cached_features_file)
-            with open(cached_features_file, "rb") as source:
-                self.examples = pickle.load(source)
-                return
-
-        logger.info("Creating features from dataset at %s", data_dir)
-        datasets = ["cnn", "dailymail"]
-
-        self.examples = {"source": [], "target": []}
-        for dataset in datasets:
-            path_to_stories = os.path.join(data_dir, dataset, "stories")
-            story_filenames_list = os.listdir(path_to_stories)
-            for story_filename in story_filenames_list:
-                path_to_story = os.path.join(path_to_stories, story_filename)
-                if not os.path.isfile(path_to_story):
-                    continue
-
-                with open(path_to_story, encoding="utf-8") as source:
-                    raw_story = source.read()
-                    story_lines, summary_lines = process_story(raw_story)
-                    if len(summary_lines) == 0 or len(story_lines) == 0:
-                        continue
-
-                story_token_ids, summary_token_ids = _encode_for_summarization(
-                    story_lines, summary_lines, tokenizer
-                )
-                story_seq = _fit_to_block_size(story_token_ids, block_size)
-                self.examples["source"].append(story_seq)
-
-                summary_seq = _fit_to_block_size(summary_token_ids, block_size)
-                self.examples["summary"].append(summary_seq)
-
-        logger.info("Saving features into cache file %s", cached_features_file)
-        with open(cached_features_file, "wb") as sink:
-            pickle.dump(self.examples, sink, protocol=pickle.HIGHEST_PROTOCOL)
-
-    def __len__(self):
-        return len(self.examples)
-
-    def __getitem__(self, items):
-        return (
-            torch.tensor(self.examples["source"][items]),
-            torch.tensor(self.examples["target"][items]),
-        )
-
-
-def process_story(raw_story):
-    """ Extract the story and summary from a story file.
-
-    Attributes:
-        raw_story (str): content of the story file as an utf-8 encoded string.
-
-    Raises:
-        IndexError: If the stoy is empty or contains no highlights.
-    """
-    nonempty_lines = list(
-        filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
-    )
-
-    # for some unknown reason some lines miss a period, add it
-    nonempty_lines = [_add_missing_period(line) for line in nonempty_lines]
-
-    # gather article lines
-    story_lines = []
-    lines = deque(nonempty_lines)
-    while True:
-        try:
-            element = lines.popleft()
-            if element.startswith("@highlight"):
-                break
-            story_lines.append(element)
-        except IndexError:
-            # if "@highlight" is absent from the file we pop
-            # all elements until there is None.
-            return story_lines, []
-
-    # gather summary lines
-    summary_lines = list(filter(lambda t: not t.startswith("@highlight"), lines))
-
-    return story_lines, summary_lines
-
-
-def _encode_for_summarization(story_lines, summary_lines, tokenizer):
-    """ Encode the story and summary lines, and join them
-    as specified in [1] by using `[SEP] [CLS]` tokens to separate
-    sentences.
-    """
-    story_lines_token_ids = [
-        tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
-        for line in story_lines
-    ]
-    summary_lines_token_ids = [
-        tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
-        for line in summary_lines
-    ]
-
-    story_token_ids = [
-        token for sentence in story_lines_token_ids for token in sentence
-    ]
-    summary_token_ids = [
-        token for sentence in summary_lines_token_ids for token in sentence
-    ]
-
-    return story_token_ids, summary_token_ids
-
-
-def _add_missing_period(line):
-    END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"]
-    if line.startswith("@highlight"):
-        return line
-    if line[-1] in END_TOKENS:
-        return line
-    return line + "."
-
-
-def _fit_to_block_size(sequence, block_size):
-    """ Adapt the source and target sequences' lengths to the block size.
-    If the sequence is shorter than the block size we pad it with -1 ids
-    which correspond to padding tokens.
-    """
-    if len(sequence) > block_size:
-        return sequence[:block_size]
-    else:
-        sequence.extend([0] * (block_size - len(sequence)))
-        return sequence
-
-
-def mask_padding_tokens(sequence):
-    """ Padding token, encoded as 0, are represented by the value -1 in the
-    masks """
-    padded = sequence.clone()
-    padded[padded == 0] = -1
-    return padded
-
-
 def load_and_cache_examples(args, tokenizer):
-    dataset = TextDataset(tokenizer, data_dir=args.data_dir)
+    dataset = CNNDailyMailDataset(tokenizer, data_dir=args.data_dir)
     return dataset
 
 
-def compute_token_type_ids(batch, separator_token_id):
-    """ Segment embeddings as described in [1]
+def collate(data, tokenizer, block_size):
+    """ Takes a list of (story, summary) tuples as input. """
+    # remove the files with an empty story/summary, encode and fit to block
+    data = filter(lambda x: not (len(x[0]) == 0 or len(x[1]) == 0), data)
+    data = [
+        encode_for_summarization(story, summary, tokenizer) for story, summary in data
+    ]
+    data = [
+        (
+            fit_to_block_size(story, block_size, tokenizer.pad_token_id),
+            fit_to_block_size(summary, block_size, tokenizer.pad_token_id),
+        )
+        for story, summary in data
+    ]
 
-    The values {0,1} were found in the repository [2].
+    stories = torch.tensor([story for story, summary in data])
+    summaries = torch.tensor([summary for story, summary in data])
+    encoder_token_type_ids = compute_token_type_ids(stories, tokenizer.cls_token_id)
+    encoder_mask = build_mask(stories, tokenizer.pad_token_id)
+    decoder_mask = build_mask(summaries, tokenizer.pad_token_id)
+    lm_labels = build_lm_labels(summaries, tokenizer.pad_token_id)
 
-    Attributes:
-        batch: torch.Tensor, size [batch_size, block_size]
-            Batch of input.
-        separator_token_id: int
-            The value of the token that separates the segments.
-
-    [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
-        arXiv preprint arXiv:1908.08345 (2019).
-    [2] https://github.com/nlpyang/PreSumm (/src/prepro/data_builder.py, commit fac1217)
-    """
-    batch_embeddings = []
-    sentence_num = 0
-    for sequence in batch:
-        embeddings = []
-        for s in sequence:
-            if s == separator_token_id:
-                sentence_num += 1
-            embeddings.append(sentence_num % 2)
-        batch_embeddings.append(embeddings)
-    return torch.tensor(batch_embeddings)
+    return (
+        stories,
+        summaries,
+        encoder_token_type_ids,
+        encoder_mask,
+        decoder_mask,
+        lm_labels,
+    )
 
 
 # ----------
@@ -252,7 +113,7 @@ class BertSumOptimizer(object):
         arXiv preprint arXiv:1908.08345 (2019).
     """
 
-    def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-9):
+    def __init__(self, model, lr, warmup_steps, beta_1=0.99, beta_2=0.999, eps=1e-8):
         self.encoder = model.encoder
         self.decoder = model.decoder
         self.lr = lr
@@ -306,8 +167,12 @@ def train(args, model, tokenizer):
     args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
     train_dataset = load_and_cache_examples(args, tokenizer)
     train_sampler = RandomSampler(train_dataset)
+    model_collate_fn = functools.partial(collate, tokenizer=tokenizer, block_size=512)
     train_dataloader = DataLoader(
-        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size
+        train_dataset,
+        sampler=train_sampler,
+        batch_size=args.train_batch_size,
+        collate_fn=model_collate_fn,
     )
 
     # Training schedule
@@ -351,26 +216,23 @@ def train(args, model, tokenizer):
     for _ in train_iterator:
         epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=True)
         for step, batch in enumerate(epoch_iterator):
-            source, target = batch
-            token_type_ids = compute_token_type_ids(source, tokenizer.cls_token_id)
-            labels_src = mask_padding_tokens(source)
-            labels_tgt = mask_padding_tokens(target)
+            source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
 
             source = source.to(args.device)
             target = target.to(args.device)
-            token_type_ids = token_type_ids.to(args.device)
-            labels_src = labels_src.to(args.device)
-            labels_tgt = labels_tgt.to(args.device)
+            encoder_token_type_ids = encoder_token_type_ids.to(args.device)
+            encoder_mask = encoder_mask.to(args.device)
+            decoder_mask = decoder_mask.to(args.device)
+            lm_labels = lm_labels.to(args.device)
 
             model.train()
             outputs = model(
                 source,
                 target,
-                token_type_ids=token_type_ids,
-                decoder_encoder_attention_mask=labels_src,
-                decoder_attention_mask=labels_tgt,
-                decoder_lm_labels=labels_tgt,
-                decoder_initialize_randomly=True,
+                encoder_token_type_ids=encoder_token_type_ids,
+                encoder_attention_mask=encoder_mask,
+                decoder_attention_mask=decoder_mask,
+                decoder_lm_labels=lm_labels,
             )
 
             loss = outputs[0]
@@ -421,21 +283,23 @@ def evaluate(args, model, tokenizer, prefix=""):
     model.eval()
 
     for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        source, target = batch
-        labels_src = mask_padding_tokens(source)
-        labels_tgt = mask_padding_tokens(target)
-        source.to(args.device)
-        target.to(args.device)
-        labels_src.to(args.device)
-        labels_tgt.to(args.device)
+        source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
+
+        source = source.to(args.device)
+        target = target.to(args.device)
+        encoder_token_type_ids = encoder_token_type_ids.to(args.device)
+        encoder_mask = encoder_mask.to(args.device)
+        decoder_mask = decoder_mask.to(args.device)
+        lm_labels = lm_labels.to(args.device)
 
         with torch.no_grad():
             outputs = model(
                 source,
                 target,
-                decoder_encoder_attention_mask=labels_src,
-                decoder_attention_mask=labels_tgt,
-                decoder_lm_labels=labels_tgt,
+                encoder_token_type_ids=encoder_token_type_ids,
+                encoder_attention_mask=encoder_mask,
+                decoder_attention_mask=decoder_mask,
+                decoder_lm_labels=lm_labels,
             )
             lm_loss = outputs[0]
             eval_loss += lm_loss.mean().item()
@@ -525,7 +389,7 @@ def main():
     )
     parser.add_argument(
         "--num_train_epochs",
-        default=1,
+        default=10,
         type=int,
         help="Total number of training epochs to perform.",
     )
@@ -558,9 +422,13 @@ def main():
         args.device = torch.device("cuda")
         args.n_gpu = torch.cuda.device_count()
 
-    # Load pretrained model and tokenizer
+    # Load pretrained model and tokenizer. The decoder's weights are randomly initialized.
     tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
-    model = Model2Model.from_pretrained(args.model_name_or_path)
+    config = BertConfig.from_pretrained(args.model_name_or_path)
+    decoder_model = BertForMaskedLM(config)
+    model = Model2Model.from_pretrained(
+        args.model_name_or_path, decoder_model=decoder_model
+    )
 
     # Setup logging
     logging.basicConfig(
diff --git a/examples/run_summarization_finetuning_test.py b/examples/run_summarization_finetuning_test.py
deleted file mode 100644
index fd997ee0c2..0000000000
--- a/examples/run_summarization_finetuning_test.py
+++ /dev/null
@@ -1,76 +0,0 @@
-# coding=utf-8
-# Copyright 2019 HuggingFace Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import unittest
-
-from run_summarization_finetuning import _fit_to_block_size, process_story
-
-
-class DataLoaderTest(unittest.TestCase):
-    def setUp(self):
-        self.block_size = 10
-
-    def test_truncate_sequence_too_small(self):
-        """ Pad the sequence with 0 if the sequence is smaller than the block size."""
-        sequence = [1, 2, 3, 4]
-        expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
-        self.assertEqual(_fit_to_block_size(sequence, self.block_size), expected_output)
-
-    def test_truncate_sequence_fit_exactly(self):
-        sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(_fit_to_block_size(sequence, self.block_size), expected_output)
-
-    def test_truncate_sequence_too_big(self):
-        sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
-        expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
-        self.assertEqual(_fit_to_block_size(sequence, self.block_size), expected_output)
-
-    def test_process_story_no_highlights(self):
-        """ Processing a story with no highlights should raise an exception.
-        """
-        raw_story = """It was the year of Our Lord one thousand seven hundred and
-        seventy-five.\n\nSpiritual revelations were conceded to England at that
-        favoured period, as at this."""
-        _, summary = process_story(raw_story)
-        self.assertEqual(summary, [])
-
-    def test_process_empty_story(self):
-        """ An empty story should also raise and exception.
-        """
-        raw_story = ""
-        story, summary = process_story(raw_story)
-        self.assertEqual(story, [])
-        self.assertEqual(summary, [])
-
-    def test_story_with_missing_period(self):
-        raw_story = (
-            "It was the year of Our Lord one thousand seven hundred and "
-            "seventy-five\n\nSpiritual revelations were conceded to England "
-            "at that favoured period, as at this.\n@highlight\n\nIt was the best of times"
-        )
-        story_lines, summary_lines = process_story(raw_story)
-
-        expected_story_lines = [
-            "It was the year of Our Lord one thousand seven hundred and seventy-five.",
-            "Spiritual revelations were conceded to England at that favoured period, as at this.",
-        ]
-        self.assertEqual(expected_story_lines, story_lines)
-
-        expected_summary_lines = ["It was the best of times."]
-        self.assertEqual(expected_summary_lines, summary_lines)
-
-
-if __name__ == "__main__":
-    unittest.main()
diff --git a/examples/utils_summarization.py b/examples/utils_summarization.py
new file mode 100644
index 0000000000..cd8bc4bc2b
--- /dev/null
+++ b/examples/utils_summarization.py
@@ -0,0 +1,184 @@
+from collections import deque
+import os
+
+import torch
+from torch.utils.data import Dataset
+
+
+# ------------
+# Data loading
+# ------------
+
+
+class CNNDailyMailDataset(Dataset):
+    """ Abstracts the dataset used to train seq2seq models.
+
+    CNN/Daily Mail:
+
+    The CNN/Daily Mail raw datasets are downloaded from [1]. The stories are
+    stored in different files; the summary appears at the end of the story as
+    sentences that are prefixed by the special `@highlight` line. To process
+    the data, untar both datasets in the same folder, and pass the path to this
+    folder as the "data_dir" argument. The formatting code was inspired by [2].
+
+    [1] https://cs.nyu.edu/~kcho/
+    [2] https://github.com/abisee/cnn-dailymail/
+    """
+
+    def __init__(self, tokenizer, prefix="train", data_dir=""):
+        assert os.path.isdir(data_dir)
+        self.tokenizer = tokenizer
+
+        # We initialize the class by listing all the files that contain
+        # stories and summaries. Files are not read in memory given
+        # the size of the corpus.
+        self.stories_path = []
+        datasets = ("cnn", "dailymail")
+        for dataset in datasets:
+            path_to_stories = os.path.join(data_dir, dataset, "stories")
+            story_filenames_list = os.listdir(path_to_stories)
+            for story_filename in story_filenames_list:
+                path_to_story = os.path.join(path_to_stories, story_filename)
+                if not os.path.isfile(path_to_story):
+                    continue
+                self.stories_path.append(path_to_story)
+
+    def __len__(self):
+        return len(self.stories_path)
+
+    def __getitem__(self, idx):
+        story_path = self.stories_path[idx]
+        with open(story_path, encoding="utf-8") as source:
+            raw_story = source.read()
+            story_lines, summary_lines = process_story(raw_story)
+        return story_lines, summary_lines
+
+
+def process_story(raw_story):
+    """ Extract the story and summary from a story file.
+
+    Attributes:
+        raw_story (str): content of the story file as an utf-8 encoded string.
+
+    Returns:
+        (story_lines, summary_lines); the summary is [] without `@highlight`.
+    """
+    nonempty_lines = list(
+        filter(lambda x: len(x) != 0, [line.strip() for line in raw_story.split("\n")])
+    )
+
+    # for some unknown reason some lines miss a period, add it
+    nonempty_lines = [_add_missing_period(line) for line in nonempty_lines]
+
+    # gather article lines
+    story_lines = []
+    lines = deque(nonempty_lines)
+    while True:
+        try:
+            element = lines.popleft()
+            if element.startswith("@highlight"):
+                break
+            story_lines.append(element)
+        except IndexError:
+            # if "@highlight" is absent from the file we pop
+            # all elements until there is None.
+            return story_lines, []
+
+    # gather summary lines
+    summary_lines = list(filter(lambda t: not t.startswith("@highlight"), lines))
+
+    return story_lines, summary_lines
+
+
+def _add_missing_period(line):
+    END_TOKENS = [".", "!", "?", "...", "'", "`", '"', u"\u2019", u"\u2019", ")"]
+    if line.startswith("@highlight"):
+        return line
+    if line[-1] in END_TOKENS:
+        return line
+    return line + "."
+
+
+# --------------------------
+# Encoding and preprocessing
+# --------------------------
+
+
+def fit_to_block_size(sequence, block_size, pad_token):
+    """ Adapt the source and target sequences' lengths to the block size.
+    If the sequence is shorter than the block size we pad it (in place) with
+    `pad_token` ids up to the block size.
+    """
+    if len(sequence) > block_size:
+        return sequence[:block_size]
+    else:
+        sequence.extend([pad_token] * (block_size - len(sequence)))
+        return sequence
+
+
+def build_lm_labels(sequence, pad_token):
+    """ Padding tokens, identified by `pad_token`, are replaced by the value -1
+    so they are not taken into account in the loss computation. """
+    padded = sequence.clone()
+    padded[padded == pad_token] = -1
+    return padded
+
+
+def build_mask(sequence, pad_token):
+    """ Builds the mask. The attention mechanism will only attend to positions
+    with value 1. """
+    mask = sequence.clone()
+    mask[mask != pad_token] = 1
+    mask[mask == pad_token] = 0
+    return mask
+
+
+def encode_for_summarization(story_lines, summary_lines, tokenizer):
+    """ Encode the story and summary lines, and join them
+    as specified in [1] by using `[SEP] [CLS]` tokens to separate
+    sentences.
+    """
+    story_lines_token_ids = [
+        tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
+        for line in story_lines
+    ]
+    summary_lines_token_ids = [
+        tokenizer.add_special_tokens_single_sequence(tokenizer.encode(line))
+        for line in summary_lines
+    ]
+
+    story_token_ids = [
+        token for sentence in story_lines_token_ids for token in sentence
+    ]
+    summary_token_ids = [
+        token for sentence in summary_lines_token_ids for token in sentence
+    ]
+
+    return story_token_ids, summary_token_ids
+
+
+def compute_token_type_ids(batch, separator_token_id):
+    """ Segment embeddings as described in [1]
+
+    The values {0,1} were found in the repository [2].
+
+    Attributes:
+        batch: torch.Tensor, size [batch_size, block_size]
+            Batch of input.
+        separator_token_id: int
+            The value of the token that separates the segments.
+
+    [1] Liu, Yang, and Mirella Lapata. "Text summarization with pretrained encoders."
+        arXiv preprint arXiv:1908.08345 (2019).
+    [2] https://github.com/nlpyang/PreSumm (/src/prepro/data_builder.py, commit fac1217)
+    """
+    batch_embeddings = []
+    for sequence in batch:
+        sentence_num = 0
+        embeddings = []
+        for s in sequence:
+            if s == separator_token_id:
+                sentence_num += 1
+            embeddings.append(sentence_num % 2)
+        batch_embeddings.append(embeddings)
+    return torch.tensor(batch_embeddings)
diff --git a/examples/utils_summarization_test.py b/examples/utils_summarization_test.py
new file mode 100644
index 0000000000..7a02f8fa1f
--- /dev/null
+++ b/examples/utils_summarization_test.py
@@ -0,0 +1,133 @@
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import numpy as np
+import torch
+
+from utils_summarization import (
+    compute_token_type_ids,
+    fit_to_block_size,
+    build_mask,
+    build_lm_labels,
+    process_story,
+)
+
+
+class SummarizationDataProcessingTest(unittest.TestCase):
+    def setUp(self):
+        self.block_size = 10
+
+    def test_fit_to_block_sequence_too_small(self):
+        """ Pad the sequence with 0 if the sequence is smaller than the block size."""
+        sequence = [1, 2, 3, 4]
+        expected_output = [1, 2, 3, 4, 0, 0, 0, 0, 0, 0]
+        self.assertEqual(
+            fit_to_block_size(sequence, self.block_size, 0), expected_output
+        )
+
+    def test_fit_to_block_sequence_fit_exactly(self):
+        """ Do nothing if the sequence is the right size. """
+        sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        self.assertEqual(
+            fit_to_block_size(sequence, self.block_size, 0), expected_output
+        )
+
+    def test_fit_to_block_sequence_too_big(self):
+        """ Truncate the sequence if it is too long. """
+        sequence = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
+        expected_output = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+        self.assertEqual(
+            fit_to_block_size(sequence, self.block_size, 0), expected_output
+        )
+
+    def test_process_story_no_highlights(self):
+        """ Processing a story with no highlights returns an empty list for the summary.
+        """
+        raw_story = """It was the year of Our Lord one thousand seven hundred and
+        seventy-five.\n\nSpiritual revelations were conceded to England at that
+        favoured period, as at this."""
+        _, summary_lines = process_story(raw_story)
+        self.assertEqual(summary_lines, [])
+
+    def test_process_empty_story(self):
+        """ An empty story returns an empty collection of lines.
+        """
+        raw_story = ""
+        story_lines, summary_lines = process_story(raw_story)
+        self.assertEqual(story_lines, [])
+        self.assertEqual(summary_lines, [])
+
+    def test_process_story_with_missing_period(self):
+        raw_story = (
+            "It was the year of Our Lord one thousand seven hundred and "
+            "seventy-five\n\nSpiritual revelations were conceded to England "
+            "at that favoured period, as at this.\n@highlight\n\nIt was the best of times"
+        )
+        story_lines, summary_lines = process_story(raw_story)
+
+        expected_story_lines = [
+            "It was the year of Our Lord one thousand seven hundred and seventy-five.",
+            "Spiritual revelations were conceded to England at that favoured period, as at this.",
+        ]
+        self.assertEqual(expected_story_lines, story_lines)
+
+        expected_summary_lines = ["It was the best of times."]
+        self.assertEqual(expected_summary_lines, summary_lines)
+
+    def test_build_lm_labels_no_padding(self):
+        sequence = torch.tensor([1, 2, 3, 4])
+        expected = sequence
+        np.testing.assert_array_equal(
+            build_lm_labels(sequence, 0).numpy(), expected.numpy()
+        )
+
+    def test_build_lm_labels(self):
+        sequence = torch.tensor([1, 2, 3, 4, 0, 0, 0])
+        expected = torch.tensor([1, 2, 3, 4, -1, -1, -1])
+        np.testing.assert_array_equal(
+            build_lm_labels(sequence, 0).numpy(), expected.numpy()
+        )
+
+    def test_build_mask_no_padding(self):
+        sequence = torch.tensor([1, 2, 3, 4])
+        expected = torch.tensor([1, 1, 1, 1])
+        np.testing.assert_array_equal(
+            build_mask(sequence, 0).numpy(), expected.numpy()
+        )
+
+    def test_build_mask(self):
+        sequence = torch.tensor([1, 2, 3, 4, 23, 23, 23])
+        expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
+        np.testing.assert_array_equal(
+            build_mask(sequence, 23).numpy(), expected.numpy()
+        )
+
+    def test_compute_token_type_ids(self):
+        separator = 101
+        batch = torch.tensor(
+            [[1, 2, 3, 4, 5, 6], [1, 2, 3, 101, 5, 6], [1, 101, 3, 4, 101, 6]]
+        )
+        expected = torch.tensor(
+            [[0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 1, 1], [0, 1, 1, 1, 0, 0]]
+        )
+
+        result = compute_token_type_ids(batch, separator)
+        np.testing.assert_array_equal(result, expected)
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/transformers/modeling_beam_search.py b/transformers/modeling_beam_search.py
index 3a27625f90..171dcb7247 100644
--- a/transformers/modeling_beam_search.py
+++ b/transformers/modeling_beam_search.py
@@ -26,189 +26,220 @@ import torch
 from torch import nn
 
 
-class ModelWithBeamSearch(nn.Module):
+class TransformerBeamSearch(nn.Module):
     def __init__(
         self,
         model,
+        tokenizer,
+        batch_size,
         beam_size,
-        start_token_id,
-        end_token_id,
-        pad_token_id,
         min_length,
         max_length,
-        alpha,
-        block_trigram=True,
+        alpha=0,
+        block_repeating_trigram=True,
     ):
         """
         Attributes:
             mask_word_id: token id that corresponds to the mask
         """
-        super(ModelWithBeamSearch, self).__init__()
+        super(TransformerBeamSearch, self).__init__()
         self.model = model
+        self.tokenizer = tokenizer
+
+        self.start_token_id = tokenizer.start_token_id
+        self.end_token_id = tokenizer.end_token_id
+        self.pad_token_id = tokenizer.pad_token_id
+
         self.beam_size = beam_size
-        self.start_token_id = start_token_id
-        self.end_token_id = end_token_id
-        self.pad_token_id = pad_token_id
         self.min_length = min_length
         self.max_length = max_length
-        self.alpha = alpha
-        self.block_trigram = block_trigram
 
-    def forward(self, input_ids, **kwargs):
-        # Separate the encoder- and decoder- specific kwargs. A kwarg is
-        # decoder-specific it the key starts with `decoder_`
+        self.block_repeating_trigram = block_repeating_trigram
+        self.apply_length_penalty = False if alpha == 0 else True
+        self.alpha = alpha
+
+        # State of the beam
+        self.hypotheses = [[] for _ in range(batch_size)]
+        self.batch_offset = torch.arange(batch_size, dtype=torch.long)
+        self.beam_offset = torch.arange(
+            0, batch_size * self.beam_size, step=self.beam_size, dtype=torch.long
+        )
+        self.growing_beam = torch.full(
+            (batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long
+        )
+        self.topk_log_probabilities = torch.tensor(
+            [0.0] + [float("-inf")] * (self.beam_size - 1), dtype=torch.float
+        ).repeat(batch_size)
+        self.results = {
+            "prediction": [[] for _ in batch_size],
+            "scores": [[] for _ in batch_size],
+        }
+        self._step = 0
+        self.is_done = False
+
+    def step(self, log_probabilities):
+        """ Grows the beam by one step. """
+        self._step += 1
+
+        # The batch size changes as some beams finish so we define _B
+        vocab_size = log_probabilities.size(-1)
+        _B = log_probabilities.size(0) // self.beam_size
+
+        # Multiply each beam probability with the probability of the
+        # next token (conditioned on the words in the beam).
+        log_probabilities += self.topk_log_probabilities.view(-1, 1)
+
+        self.enforce_min_length(log_probabilities)
+        if self.block_repeating_trigram:
+            self.remove_repeating_trigrams(log_probabilities, _B)
+
+        # Find the `beam_size` (previous_beam + token) combinations with
+        # the highest score
+        topk_log_probabilities, topk_ids = log_probabilities.topk(
+            log_probabilities.view(_B, self.beam_size * vocab_size),
+            self.beam_size,
+            dim=1,
+        )
+
+        # Apply the length penalty. The +1 accounts for the [EOS] token
+        # that will be added if the beam ends.
+        topk_scores = topk_log_probabilities / self.length_penalty()
+
+        # Retrieve the corresponding respective beam and token id
+        # topk_token_ids[i] will be added to topk_beam_ids[i]
+        topk_beam_ids = topk_ids.div(vocab_size)
+        topk_token_ids = topk_ids.fmod(vocab_size)
+
+        # Retrieve the row index of the surviving beams in the original
+        # view of the log_probabilities tensor
+        surviving_beams_rows = (topk_beam_ids + self.beam_offset[:_B].view(-1, 1)).view(
+            -1
+        )
+
+        # Append the last predictions
+        self.growing_beam = torch.cat(
+            [
+                self.growing_beam.index_select(0, surviving_beams_rows),
+                topk_token_ids.view(-1, 1),
+            ],
+            1,
+        )
+
+        # Check if any of the beam searches has ended during this
+        # growth step. Also if top beam (most probable) has ended
+        # for one element of the batch.
+        is_finished = topk_token_ids.eq(self.end_token_id)
+        self.enforce_max_length()
+        is_top_beam_finished = is_finished[:, 0].eq(1)
+
+        # Save the finished searches
+        if is_finished.any():
+            predictions = self.growing_beam.view(
+                -1, self.beam_size, self.growing_beam.size(1)
+            )
+            for i in range(is_finished.size(0)):
+                if is_top_beam_finished[i]:
+                    is_finished[i].fill_(1)
+                finished_hyp = is_finished[i].nonzero().view(-1)
+
+                # Store finished hypotheses for this batch.
+                b = self.batch_offset[i]
+                for j in finished_hyp:
+                    self.hypotheses[b].append((topk_scores[i, j], predictions[i, j, :]))
+
+                # If the batch reached the end, save the best hypotheses
+                # in terms of length-penalized score.
+                if is_top_beam_finished[i]:
+                    best_hyp = sorted(
+                        self.hypotheses[b], key=lambda x: x[0], reverse=True
+                    )
+                    best_score, best_prediction = best_hyp[0]
+                    self.results["scores"][b].append(best_score)
+                    self.results["predictions"][b].append(best_prediction)
+
+            non_finished = is_top_beam_finished.eq(0).nonzero().view(-1)
+            if len(non_finished) == 0:
+                self.is_done = True
+
+            # Remove finished batches for the next step.
+            topk_log_probabilities = topk_log_probabilities.index_select(
+                0, non_finished
+            )
+            self.batch_offset = self.batch_offset.index_select(0, non_finished)
+            self.growing_beam = predictions.index_select(0, non_finished).view(
+                -1, self.growing_beam.size(-1)
+            )
+
+            surviving_beams_rows = surviving_beams_rows.index_select(0, non_finished)
+
+        return surviving_beams_rows
+
+    def forward(self, encoder_input_ids, **kwargs):
+        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
+        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
+        # that apply to the model as whole.
+        # We let the specific kwargs override the common ones in case of conflict.
         kwargs_encoder = {
-            argument: value
+            argument[len("encoder_"):]: value
             for argument, value in kwargs.items()
-            if not argument.startswith("decoder_")
+            if argument.startswith("encoder_")
         }
         kwargs_decoder = {
             argument[len("decoder_"):]: value
             for argument, value in kwargs.items()
             if argument.startswith("decoder_")
         }
+        kwargs_common = {
+            argument: value
+            for argument, value in kwargs.items()
+            if not (argument.startswith("encoder_") or argument.startswith("decoder_"))
+        }
+        kwargs_decoder = dict(kwargs_common, **kwargs_decoder)
+        kwargs_encoder = dict(kwargs_common, **kwargs_encoder)
 
-        batch_size, _ = input_ids.size(0)
-
-        # Variables that keep track of the status of the search
-        hypotheses = [[] for _ in range(batch_size)]
-        batch_offset = torch.arange(batch_size, dtype=torch.long)
-        beam_offset = torch.arange(
-            0,
-            batch_size * self.beam_size,
-            step=self.beam_size,
-            dtype=torch.long,
-        )
-        growing_beam = torch.full(
-            (batch_size * self.beam_size, 1),
-            self.start_token_id,
-            dtype=torch.long,
-        )
-        topk_log_probabilities = torch.tensor(
-            [0.0] + [float("-inf")] * (self.beam_size - 1),
-            dtype=torch.float,
-        ).repeat(batch_size)
-
-        # Forward pass on the encoder
-        encoder_outputs = self.encoder(input_ids, kwargs_encoder)
+        # forward pass on the encoder
+        encoder_outputs = self.model.encoder.forward(encoder_input_ids, kwargs_encoder)
         kwargs_decoder["encoder_hidden_states"] = tile(
             encoder_outputs, self.beam_size, dim=0
         )
 
-        results = {}
-        results["predictions"] = [[] for _ in batch_size]
-        results["scores"] = [[] for _ in batch_size]
-
+        # grow the beam by generating sequences in an autoregressive way
+        self.growing_beam = torch.full(
+            (self.batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long
+        )
         for step in range(self.max_length):
-            decoder_input = growing_beam[:, -1]
-            outputs = self.decoder(decoder_input, kwargs_decoder)
+            decoder_input = self.growing_beam[:, -1]
+            outputs = self.model.decoder(decoder_input, kwargs_decoder)
             log_probabilities = torch.nn.functional.log_softmax(outputs[1])
-            vocab_size = log_probabilities.size(-1)
+            surviving_beams_rows = self.step(log_probabilities)
+            if self.is_done:
+                break
 
-            # The batch size changes as some beams finish so we define:
-            _B = log_probabilities.size(0) // self.beam_size
-
-            # Multiply each beam probability with the probability of the
-            # next token (conditioned on the words in the beam).
-            log_probabilities += topk_log_probabilities.view(-1, 1)
-
-            # if the beam has not attained the minimum required length we
-            # make the end token arbitrarily unlikely.
-            if step < self.min_length:
-                log_probabilities[self.end_token_id] = -1e20
-
-            # Remove repeating tri-grams
-            if(self.args.block_trigram):
-                if(step + 1 > 3):
-                    for i in range(_B * self.beam_size):
-                        tokens = [t for t in growing_beam[i]]
-                        trigrams = [(tokens[i-1], tokens[i], tokens[i+1]) for i in range(1, len(words) - 1)]
-                        last_trigram = tuple(trigrams[-1])
-                        if last_trigram in trigrams[:-1]:
-                            log_probabilities[i] = -1e20
-
-            # Find the `beam_size` (previous_beam + token) combinations with
-            # the highest score
-            topk_log_probabilities, topk_ids = log_probabilities.topk(
-                log_probabilities.view(_B, self.beam_size * vocab_size),
-                self.beam_size,
-                dim=1
-            )
-
-            # Apply the length penalty. The +1 accounts for the [EOS] token
-            # that will be added if the beam ends.
-            length_penalty = ((5.0 + (step + 1)) / 6.0) ** self.alpha
-            topk_scores = topk_log_probabilities / length_penalty
-
-            # Retrieve the corresponding respective beam and token id
-            # topk_token_ids[i] will be added to topk_beam_ids[i]
-            topk_beam_ids = topk_ids.div(vocab_size)
-            topk_token_ids = topk_ids.fmod(vocab_size)
-
-            # Retrieve the row index of the surviving beams in the original
-            # view of the log_probabilities tensor
-            surviving_beams_rows = (
-                topk_beam_ids + beam_offset[:_B].view(-1, 1)
-            ).view(-1)
-
-            # Append the last predictions
-            growing_beam = torch.cat(
-                [
-                    growing_beam.index_select(0, surviving_beams_rows),
-                    topk_token_ids.view(-1, 1),
-                ],
-                1,
-            )
-
-            # Check if any of the beam searches has ended during this
-            # growth step. Also if top beam (most probable) has ended
-            # for one element of the batch.
-            is_finished = topk_token_ids.eq(self.end_token_id)
-            if step + 1 == self.max_length:
-                is_finished.fill_(1)
-            is_top_beam_finished = is_finished[:, 0].eq(1)
-
-            # Save the finished searches
-            if is_finished.any():
-                predictions = growing_beam.view(-1, self.beam_size, growing_beam.size(1))
-                for i in range(is_finished.size(0)):
-                    if is_top_beam_finished[i]:
-                        is_finished[i].fill_(1)
-                    finished_hyp = is_finished[i].nonzero().view(-1)
-
-                    # Store finished hypotheses for this batch.
-                    b = batch_offset[i]
-                    for j in finished_hyp:
-                        hypotheses[b].append((topk_scores[i, j], predictions[i, j, :]))
-
-                    # If the batch reached the end, save the best hypotheses
-                    # in terms of length-penalized score.
-                    if is_top_beam_finished[i]:
-                        best_hyp = sorted(
-                            hypotheses[b], key=lambda x: x[0], reverse=True
-                        )
-                        best_score, best_prediction = best_hyp[0]
-                        results["scores"][b].append(best_score)
-                        results["predictions"][b].append(best_prediction)
-
-                non_finished = is_top_beam_finished.eq(0).nonzero().view(-1)
-                if len(non_finished) == 0:
-                    break
-
-                # Remove finished batches for the next step.
-                topk_log_probabilities = topk_log_probabilities.index_select(0, non_finished)
-                batch_offset = batch_offset.index_select(0, non_finished)
-                growing_beam = predictions.index_select(0, non_finished).view(
-                    -1, growing_beam.size(-1)
-                )
-
-            # Re-order the state for the next pass
-            surviving_beams_rows = surviving_beams_rows.index_select(0, non_finished)
             kwargs_decoder["encoder_hidden_states"] = kwargs_decoder[
                 "encoder_hidden_states"
             ].index_select(0, surviving_beams_rows)
 
-        return results
+        return self.results
+
+    def remove_repeating_trigrams(self, log_probabilities, _B):
+        if(self._step + 1 > 3):
+            for i in range(_B * self.beam_size):
+                tokens = [t for t in self.growing_beam[i]]
+                trigrams = [(tokens[i-1], tokens[i], tokens[i+1]) for i in range(1, len(words) - 1)]
+                last_trigram = tuple(trigrams[-1])
+                if last_trigram in trigrams[:-1]:
+                    log_probabilities[i] = -1e20
+
+    def enforce_min_length(self):
+        if self._step < self.min_length:
+            self.log_probabilities[self.end_token_id] = -1e20
+
+    def enforce_max_length(self):
+        if self._step + 1 == self.max_length:
+            self.is_finished.fill_(1)
+
+    def length_penalty(self):
+        return ((5.0 + (self._step + 1)) / 6.0) ** self.alpha
 
 
 def tile(x, count, dim=0):
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 93f3c7e1f1..1081c8dd7b 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -632,6 +632,8 @@ class BertModel(BertPreTrainedModel):
         """
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
+        if encoder_attention_mask is None:
+            encoder_attention_mask = torch.ones_like(input_ids)
         if token_type_ids is None:
             token_type_ids = torch.zeros_like(input_ids)
 
@@ -660,12 +662,15 @@ class BertModel(BertPreTrainedModel):
         extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
 
-        # If a 2D encoder attention mask is provided for the cross-attention
+        # If a 2D ou 3D attention mask is provided for the cross-attention
         # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
-        if encoder_attention_mask is not None:
-            encoder_attention_mask = encoder_attention_mask[:, None, None, :]
-            encoder_attention_mask = encoder_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-            encoder_attention_mask = (1.0 - encoder_attention_mask) * -10000.0
+        if encoder_attention_mask.dim() == 3:
+            encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+        if encoder_attention_mask.dim() == 2:
+            encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+
+        encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+        encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
 
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
@@ -687,7 +692,7 @@ class BertModel(BertPreTrainedModel):
                                        attention_mask=extended_attention_mask,
                                        head_mask=head_mask,
                                        encoder_hidden_states=encoder_hidden_states,
-                                       encoder_attention_mask=encoder_attention_mask)
+                                       encoder_attention_mask=encoder_extended_attention_mask)
         sequence_output = encoder_outputs[0]
         pooled_output = self.pooler(sequence_output)
 
@@ -788,8 +793,10 @@ class BertForMaskedLM(BertPreTrainedModel):
             in ``[0, ..., config.vocab_size]``
 
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+        **masked_lm_loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Masked language modeling loss.
+        **next_token_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Next token prediction loss.
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
@@ -854,13 +861,13 @@ class BertForMaskedLM(BertPreTrainedModel):
 
         if lm_labels is not None:
             # we are doing next-token prediction; shift prediction scores and input ids by one
-            prediction_scores = prediction_scores[:, :-1, :]
-            lm_labels = lm_labels[:, 1:]
+            prediction_scores = prediction_scores[:, :-1, :].contiguous()
+            lm_labels = lm_labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            seq2seq_loss = loss_fct(prediction_scores.reshape(-1, self.config.vocab_size), lm_labels.reshape(-1))
-            outputs = (seq2seq_loss,) + outputs
+            next_token_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1))
+            outputs = (next_token_loss,) + outputs
 
-        return outputs  # (mlm_or_seq2seq_loss), prediction_scores, (hidden_states), (attentions)
+        return outputs  # (masked_lm_loss), (next_token_loss), prediction_scores, (hidden_states), (attentions)
 
 
 @add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,
diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index 2767dd2cd1..22898db9a1 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -29,7 +29,7 @@ logger = logging.getLogger(__name__)
 
 class PreTrainedSeq2seq(nn.Module):
     r"""
-        :class:`~transformers.Seq2seq` is a generic model class that will be
+        :class:`~transformers.PreTrainedSeq2seq` is a generic model class that will be
         instantiated as a Seq2seq model with one of the base model classes of
         the library as encoder and (optionally) as decoder when created with
         the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class
@@ -49,8 +49,7 @@ class PreTrainedSeq2seq(nn.Module):
         *model_args,
         **kwargs
     ):
-        r""" Instantiates an encoder and a decoder from one or two base classes
-        of the library from pre-trained model checkpoints.
+        r""" Instantiates an encoder and a decoder from one or two base classes of the library from pre-trained model checkpoints.
 
 
         The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
@@ -111,35 +110,44 @@ class PreTrainedSeq2seq(nn.Module):
             model = PreTrainedSeq2seq.from_pretained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
         """
 
-        # Separate the encoder- and decoder- specific kwargs. A kwarg is
-        # decoder-specific it the key starts with `decoder_`
+        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
+        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
+        # that apply to the model as a whole.
+        # We let the specific kwargs override the common ones in case of conflict.
         kwargs_encoder = {
-            argument: value
+            argument[len("encoder_"):]: value
             for argument, value in kwargs.items()
-            if not argument.startswith("decoder_")
+            if argument.startswith("encoder_")
         }
         kwargs_decoder = {
-            argument[len("decoder_") :]: value
+            argument[len("decoder_"):]: value
             for argument, value in kwargs.items()
             if argument.startswith("decoder_")
         }
+        kwargs_common = {
+            argument: value
+            for argument, value in kwargs.items()
+            if not (argument.startswith("encoder_") or argument.startswith("decoder_"))
+        }
+        kwargs_decoder = dict(kwargs_common, **kwargs_decoder)
+        kwargs_encoder = dict(kwargs_common, **kwargs_encoder)
 
         # Load and initialize the encoder and decoder
         # The distinction between encoder and decoder at the model level is made
         # by the value of the flag `is_decoder` that we need to set correctly.
-        encoder = kwargs_encoder.pop("encoder_model", None)
+        encoder = kwargs_encoder.pop("model", None)
         if encoder is None:
-            kwargs_encoder["is_decoder"] = False
             encoder = AutoModel.from_pretrained(
                 encoder_pretrained_model_name_or_path, *model_args, **kwargs_encoder
             )
+        encoder.config.is_decoder = False
 
         decoder = kwargs_decoder.pop("model", None)
         if decoder is None:
-            kwargs_decoder["is_decoder"] = True
             decoder = AutoModelWithLMHead.from_pretrained(
                 decoder_pretrained_model_name_or_path, **kwargs_decoder
             )
+        decoder.config.is_decoder = True
 
         model = cls(encoder, decoder)
 
@@ -169,37 +177,60 @@ class PreTrainedSeq2seq(nn.Module):
             decoder_input_ids: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``
                 Indices of decoder input sequence tokens in the vocabulary.
         """
-        # Separate the encoder- and decoder- specific kwargs. A kwarg is
-        # decoder-specific it the key starts with `decoder_`
+        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
+        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
+        # that apply to the model as whole.
+        # We let the specific kwargs override the common ones in case of conflict.
         kwargs_encoder = {
-            argument: value
+            argument[len("encoder_"):]: value
             for argument, value in kwargs.items()
-            if not argument.startswith("decoder_")
+            if argument.startswith("encoder_")
         }
         kwargs_decoder = {
-            argument[len("decoder_") :]: value
+            argument[len("decoder_"):]: value
             for argument, value in kwargs.items()
             if argument.startswith("decoder_")
         }
+        kwargs_common = {
+            argument: value
+            for argument, value in kwargs.items()
+            if not (argument.startswith("encoder_") or argument.startswith("decoder_"))
+        }
+        kwargs_decoder = dict(kwargs_common, **kwargs_decoder)
+        kwargs_encoder = dict(kwargs_common, **kwargs_encoder)
 
         # Encode if needed (training, first prediction pass)
-        encoder_hidden_states = kwargs_encoder.pop("encoder_hidden_states", None)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
         if encoder_hidden_states is None:
             encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[0][
-                -1
-            ]  # output of the encoder *stack*
+            encoder_hidden_states = encoder_outputs[0]  # output the last layer hidden state
         else:
             encoder_outputs = ()
 
         # Decode
-        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states[None, :, :]
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
         decoder_outputs = self.decoder(decoder_input_ids, **kwargs_decoder)
 
         return decoder_outputs + encoder_outputs
 
 
 class Model2Model(PreTrainedSeq2seq):
+    r"""
+        :class:`~transformers.Model2Model` instantiates a Seq2Seq2 model
+        where both of the encoder and decoder are of the same family. If the
+        name of or that path to a pretrained model is specified the encoder and
+        the decoder will be initialized with the pretrained weight (the
+        cross-attention will be intialized randomly if its weights are not
+        present).
+
+        It is possible to override this behavior and initialize, say, the decoder randomly
+        by creating it beforehand as follows
+
+            config = BertConfig.from_pretrained()
+            decoder = BertForMaskedLM(config)
+            model = Model2Model.from_pretrained('bert-base-uncased', decoder_model=decoder)
+    """
     def __init__(self, *args, **kwargs):
         super(Model2Model, self).__init__(*args, **kwargs)
         self.tie_weights()
@@ -235,14 +266,10 @@ class Model2Model(PreTrainedSeq2seq):
         model = super(Model2Model, cls).from_pretrained(
             encoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
             decoder_pretrained_model_name_or_path=pretrained_model_name_or_path,
+            *args,
             **kwargs
         )
 
-        # Some architectures require for the decoder to be initialized randomly
-        # before fine-tuning.
-        if kwargs.get("decoder_initialize_randomly", False):
-            model.decoder.init_weights()
-
         return model
 
 

From 098a89f312311a730275a79af7cf5c527d35fdd7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Tue, 29 Oct 2019 20:08:03 +0100
Subject: [PATCH 126/144] update docstrings; rename lm_labels to more explicit
 ltr_lm_labels

---
 examples/run_summarization_finetuning.py |  8 ++--
 transformers/modeling_bert.py            | 51 +++++++++++++-----------
 2 files changed, 32 insertions(+), 27 deletions(-)

diff --git a/examples/run_summarization_finetuning.py b/examples/run_summarization_finetuning.py
index 1888f56caf..2dc8c660ce 100644
--- a/examples/run_summarization_finetuning.py
+++ b/examples/run_summarization_finetuning.py
@@ -26,7 +26,7 @@ import numpy as np
 from tqdm import tqdm, trange
 import torch
 from torch.optim import Adam
-from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
+from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
 
 from transformers import (
     AutoTokenizer,
@@ -283,14 +283,14 @@ def evaluate(args, model, tokenizer, prefix=""):
     model.eval()
 
     for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
+        source, target, encoder_token_type_ids, encoder_mask, decoder_mask, ltr_lm_labels = batch
 
         source = source.to(args.device)
         target = target.to(args.device)
         encoder_token_type_ids = encoder_token_type_ids.to(args.device)
         encoder_mask = encoder_mask.to(args.device)
         decoder_mask = decoder_mask.to(args.device)
-        lm_labels = lm_labels.to(args.device)
+        ltr_lm_labels = ltr_lm_labels.to(args.device)
 
         with torch.no_grad():
             outputs = model(
@@ -299,7 +299,7 @@ def evaluate(args, model, tokenizer, prefix=""):
                 encoder_token_type_ids=encoder_token_type_ids,
                 encoder_attention_mask=encoder_mask,
                 decoder_attention_mask=decoder_mask,
-                decoder_lm_labels=lm_labels,
+                decoder_ltr_lm_labels=ltr_lm_labels,
             )
             lm_loss = outputs[0]
             eval_loss += lm_loss.mean().item()
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 1081c8dd7b..3fec69a814 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -548,6 +548,14 @@ BERT_INPUTS_DOCSTRING = r"""
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **encoder_hidden_states**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``:
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model
+            is configured as a decoder.
+        **encoder_attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask
+            is used in the cross-attention if the model is configured as a decoder.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
 """
 
 @add_start_docstrings("The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
@@ -609,26 +617,18 @@ class BertModel(BertPreTrainedModel):
                 head_mask=None, encoder_hidden_states=None, encoder_attention_mask=None):
         """ Forward pass on the Model.
 
-        The values of the attention matrix (shape [batch_size, seq_length])
-        should be 1.0 for the position we want to attend to and 0. for the ones
-        we do not want to attend to.
-
         The model can behave as an encoder (with only self-attention) as well
         as a decoder, in which case a layer of cross-attention is added between
-        ever self-attention layer, following the architecture described in [1].
+        the self-attention layers, following the architecture described in `Attention is all you need`_ by Ashish Vaswani,
+        Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
 
-        To behave like as a decoder the model needs to be initialized with the
-        `is_decoder` argument of the config set to `True`. An
+        To behave as a decoder the model needs to be initialized with the
+        `is_decoder` argument of the configuration set to `True`; an
         `encoder_hidden_states` is expected as an input to the forward pass.
-        When a decoder, there are two kinds of attention masks to specify:
 
-        (1) Self-attention masks that need to be causal (only attends to
-        previous tokens);
-        (2) A cross-attention mask that prevents the module
-        from attending to the encoder's padding tokens.
+        .. _`Attention is all you need`:
+            https://arxiv.org/abs/1706.03762
 
-        [1] Vaswani, Ashish, et al. "Attention is all you need." Advances in
-        neural information processing systems. 2017.
         """
         if attention_mask is None:
             attention_mask = torch.ones_like(input_ids)
@@ -791,11 +791,16 @@ class BertForMaskedLM(BertPreTrainedModel):
             Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
+        **ltr_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the left-to-right language modeling loss (next word prediction).
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
 
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **masked_lm_loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Masked language modeling loss.
-        **next_token_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+        **ltr_lm_loss**: (`optional`, returned when ``ltr_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Next token prediction loss.
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -833,7 +838,7 @@ class BertForMaskedLM(BertPreTrainedModel):
                                    self.bert.embeddings.word_embeddings)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
-                masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ):
+                masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, ltr_lm_labels=None, ):
 
         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
@@ -852,22 +857,22 @@ class BertForMaskedLM(BertPreTrainedModel):
         # 1. If a tensor that contains the indices of masked labels is provided,
         #    the cross-entropy is the MLM cross-entropy that measures the likelihood
         #    of predictions for masked words.
-        # 2. If `lm_label` is provided we are in a causal scenario where we
-        #    try to predict the next word for each input in the encoder.
+        # 2. If `ltr_lm_labels` is provided we are in a causal scenario where we
+        #    try to predict the next token for each input in the decoder.
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)  # -1 index = padding token
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             outputs = (masked_lm_loss,) + outputs
 
-        if lm_labels is not None:
+        if ltr_lm_labels is not None:
             # we are doing next-token prediction; shift prediction scores and input ids by one
             prediction_scores = prediction_scores[:, :-1, :].contiguous()
-            lm_labels = lm_labels[:, 1:].contiguous()
+            ltr_lm_labels = ltr_lm_labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            next_token_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1))
-            outputs = (next_token_loss,) + outputs
+            ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), ltr_lm_labels.view(-1))
+            outputs = (ltr_lm_loss,) + outputs
 
-        return outputs  # (masked_lm_loss), (next_token_loss), prediction_scores, (hidden_states), (attentions)
+        return outputs  # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions)
 
 
 @add_start_docstrings("""Bert Model with a `next sentence prediction (classification)` head on top. """,

From 842f3bf049d4a728cb4bff543e8bbd74020af230 Mon Sep 17 00:00:00 2001
From: Timothy Liu <tlkh.xms@gmail.com>
Date: Wed, 30 Oct 2019 01:32:15 +0000
Subject: [PATCH 127/144] Fixed training for TF XLM

---
 transformers/modeling_tf_xlm.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py
index 84de1517ee..9ac5d28e1f 100644
--- a/transformers/modeling_tf_xlm.py
+++ b/transformers/modeling_tf_xlm.py
@@ -84,7 +84,8 @@ def get_masks(slen, lengths, causal, padding_mask=None, dtype=tf.float32):
         attn_mask = mask
 
     # sanity check
-    assert shape_list(mask) == [bs, slen]
+    # assert shape_list(mask) == [bs, slen]
+    tf.debugging.assert_equal(shape_list(mask), [bs, slen])
     assert causal is False or shape_list(attn_mask) == [bs, slen, slen]
 
     mask = tf.cast(mask, dtype=dtype)
@@ -318,7 +319,8 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
 
         # check inputs
         bs, slen = shape_list(input_ids)
-        assert shape_list(lengths)[0] == bs
+        # assert shape_list(lengths)[0] == bs
+        tf.debugging.assert_equal(shape_list(lengths)[0], bs)
         # assert lengths.max().item() <= slen
         # input_ids = input_ids.transpose(0, 1)  # batch size as dimension 0
         # assert (src_enc is None) == (src_len is None)
@@ -335,12 +337,14 @@ class TFXLMMainLayer(tf.keras.layers.Layer):
         if position_ids is None:
             position_ids = tf.expand_dims(tf.range(slen), axis=0)
         else:
-            assert shape_list(position_ids) == [bs, slen]  # (slen, bs)
+            # assert shape_list(position_ids) == [bs, slen]  # (slen, bs)
+            tf.debugging.assert_equal(shape_list(position_ids), [bs, slen])
             # position_ids = position_ids.transpose(0, 1)
 
         # langs
         if langs is not None:
-            assert shape_list(langs) == [bs, slen]  # (slen, bs)
+            # assert shape_list(langs) == [bs, slen]  # (slen, bs)
+            tf.debugging.assert_equal(shape_list(langs), [bs, slen])
             # langs = langs.transpose(0, 1)
 
         # Prepare head mask if needed

From 9c1bdb5b61303bbdfbc3b9759f5c5fa847cb377d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 30 Oct 2019 10:43:13 +0100
Subject: [PATCH 128/144] revert renaming of lm_labels to ltr_lm_labels

---
 examples/run_summarization_finetuning.py |  6 +++---
 transformers/modeling_bert.py            | 14 +++++++-------
 transformers/modeling_seq2seq.py         | 22 +++++++++++++---------
 3 files changed, 23 insertions(+), 19 deletions(-)

diff --git a/examples/run_summarization_finetuning.py b/examples/run_summarization_finetuning.py
index 2dc8c660ce..3d194950c7 100644
--- a/examples/run_summarization_finetuning.py
+++ b/examples/run_summarization_finetuning.py
@@ -283,14 +283,14 @@ def evaluate(args, model, tokenizer, prefix=""):
     model.eval()
 
     for batch in tqdm(eval_dataloader, desc="Evaluating"):
-        source, target, encoder_token_type_ids, encoder_mask, decoder_mask, ltr_lm_labels = batch
+        source, target, encoder_token_type_ids, encoder_mask, decoder_mask, lm_labels = batch
 
         source = source.to(args.device)
         target = target.to(args.device)
         encoder_token_type_ids = encoder_token_type_ids.to(args.device)
         encoder_mask = encoder_mask.to(args.device)
         decoder_mask = decoder_mask.to(args.device)
-        ltr_lm_labels = ltr_lm_labels.to(args.device)
+        lm_labels = lm_labels.to(args.device)
 
         with torch.no_grad():
             outputs = model(
@@ -299,7 +299,7 @@ def evaluate(args, model, tokenizer, prefix=""):
                 encoder_token_type_ids=encoder_token_type_ids,
                 encoder_attention_mask=encoder_mask,
                 decoder_attention_mask=decoder_mask,
-                decoder_ltr_lm_labels=ltr_lm_labels,
+                decoder_lm_labels=lm_labels,
             )
             lm_loss = outputs[0]
             eval_loss += lm_loss.mean().item()
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 3fec69a814..11fcdde685 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -791,7 +791,7 @@ class BertForMaskedLM(BertPreTrainedModel):
             Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
-        **ltr_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the left-to-right language modeling loss (next word prediction).
             Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
             Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
@@ -800,7 +800,7 @@ class BertForMaskedLM(BertPreTrainedModel):
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **masked_lm_loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Masked language modeling loss.
-        **ltr_lm_loss**: (`optional`, returned when ``ltr_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+        **ltr_lm_loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
             Next token prediction loss.
         **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
             Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
@@ -838,7 +838,7 @@ class BertForMaskedLM(BertPreTrainedModel):
                                    self.bert.embeddings.word_embeddings)
 
     def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
-                masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, ltr_lm_labels=None, ):
+                masked_lm_labels=None, encoder_hidden_states=None, encoder_attention_mask=None, lm_labels=None, ):
 
         outputs = self.bert(input_ids,
                             attention_mask=attention_mask,
@@ -857,19 +857,19 @@ class BertForMaskedLM(BertPreTrainedModel):
         # 1. If a tensor that contains the indices of masked labels is provided,
         #    the cross-entropy is the MLM cross-entropy that measures the likelihood
         #    of predictions for masked words.
-        # 2. If `ltr_lm_labels` is provided we are in a causal scenario where we
+        # 2. If `lm_labels` is provided we are in a causal scenario where we
         #    try to predict the next token for each input in the decoder.
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)  # -1 index = padding token
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             outputs = (masked_lm_loss,) + outputs
 
-        if ltr_lm_labels is not None:
+        if lm_labels is not None:
             # we are doing next-token prediction; shift prediction scores and input ids by one
             prediction_scores = prediction_scores[:, :-1, :].contiguous()
-            ltr_lm_labels = ltr_lm_labels[:, 1:].contiguous()
+            lm_labels = lm_labels[:, 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
-            ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), ltr_lm_labels.view(-1))
+            ltr_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), lm_labels.view(-1))
             outputs = (ltr_lm_loss,) + outputs
 
         return outputs  # (masked_lm_loss), (ltr_lm_loss), prediction_scores, (hidden_states), (attentions)
diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_seq2seq.py
index 22898db9a1..ba8c546a30 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_seq2seq.py
@@ -30,10 +30,10 @@ logger = logging.getLogger(__name__)
 class PreTrainedSeq2seq(nn.Module):
     r"""
         :class:`~transformers.PreTrainedSeq2seq` is a generic model class that will be
-        instantiated as a Seq2seq model with one of the base model classes of
-        the library as encoder and (optionally) as decoder when created with
-        the `AutoModel.from_pretrained(pretrained_model_name_or_path)` class
-        method.
+        instantiated as a transformer architecture with one of the base model
+        classes of the library as encoder and (optionally) another one as
+        decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
+        class method.
     """
 
     def __init__(self, encoder, decoder):
@@ -59,13 +59,13 @@ class PreTrainedSeq2seq(nn.Module):
             encoder_pretrained_model_name_or_path: information necessary to initiate the encoder. Either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
             decoder_pretrained_model_name_or_path: information necessary to initiate the decoder. Either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
+                - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
             model_args: (`optional`) Sequence of positional arguments:
@@ -103,7 +103,7 @@ class PreTrainedSeq2seq(nn.Module):
                 - If a configuration is provided with ``config``, ``**kwargs`` will be directly passed to the underlying model's ``__init__`` method (we assume all relevant updates to the configuration have already been done)
                 - If a configuration is not provided, ``kwargs`` will be first passed to the configuration class initialization function (:func:`~transformers.PretrainedConfig.from_pretrained`). Each key of ``kwargs`` that corresponds to a configuration attribute will be used to override said attribute with the supplied ``kwargs`` value. Remaining keys that do not correspond to any configuration attribute will be passed to the underlying model's ``__init__`` function.
 
-                You can specify different kwargs for the decoder by prefixing the key with `decoder_` (e.g. ``decoder_output_attention=True``).
+                You can specify kwargs specific to the encoder and decoder by prefixing the key with `encoder_` and `decoder_` respectively (e.g. ``decoder_output_attention=True``). The remaining kwargs will be passed to both the encoder and the decoder.
 
         Examples::
 
@@ -154,8 +154,11 @@ class PreTrainedSeq2seq(nn.Module):
         return model
 
     def save_pretrained(self, save_directory):
-        """ Save a Seq2Seq model and its configuration file in a format
-        such that it can be loaded using `:func:`~transformers.PreTrainedSeq2seq.from_pretrained` """
+        """ Save a Seq2Seq model and its configuration file in a format such
+        that it can be loaded using `:func:`~transformers.PreTrainedSeq2seq.from_pretrained`
+
+        We save the encoder' and decoder's parameters in two separate directories.
+        """
         self.encoder.save_pretrained(os.path.join(save_directory, "encoder"))
         self.decoder.save_pretrained(os.path.join(save_directory, "decoder"))
 
@@ -176,6 +179,7 @@ class PreTrainedSeq2seq(nn.Module):
                 Indices of encoder input sequence tokens in the vocabulary.
             decoder_input_ids: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``
                 Indices of decoder input sequence tokens in the vocabulary.
+            kwargs: (`optional`) Remaining dictionary of keyword arguments.
         """
         # keyword arguments come in 3 flavors: encoder-specific (prefixed by
         # `encoder_`), decoder-specific (prefixed by `decoder_`) and those

From 3b0d2fa30eb9756c888b4ed36213350d4b6e70e3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 30 Oct 2019 10:54:46 +0100
Subject: [PATCH 129/144] rename seq2seq to encoder_decoder

---
 examples/README.md                             |  6 ++----
 examples/run_summarization_finetuning.py       |  4 ++--
 transformers/__init__.py                       |  2 +-
 ..._seq2seq.py => modeling_encoder_decoder.py} | 18 +++++++++---------
 4 files changed, 14 insertions(+), 16 deletions(-)
 rename transformers/{modeling_seq2seq.py => modeling_encoder_decoder.py} (96%)

diff --git a/examples/README.md b/examples/README.md
index bec6d57171..6d27a0c560 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -10,7 +10,7 @@ similar API between the different models.
 | [GLUE](#glue) | Examples running BERT/XLM/XLNet/RoBERTa on the 9 GLUE tasks. Examples feature distributed training as well as half-precision.                              |
 | [SQuAD](#squad) | Using BERT for question answering, examples with distributed training.                                                                                  |
 | [Multiple Choice](#multiple choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
-| [Seq2seq Model fine-tuning](#seq2seq-model-fine-tuning) | Fine-tuning the library models for seq2seq tasks on the CNN/Daily Mail dataset. |
+| [Abstractive summarization](#abstractive-summarization) | Fine-tuning the library models for abstractive summarization tasks on the CNN/Daily Mail dataset. |
 
 ## Language model fine-tuning
 
@@ -391,7 +391,7 @@ exact_match = 86.91
 This fine-tuned model is available as a checkpoint under the reference
 `bert-large-uncased-whole-word-masking-finetuned-squad`.
 
-## Seq2seq model fine-tuning
+## Abstractive summarization
 
 Based on the script
 [`run_summarization_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_summarization_finetuning.py).
@@ -408,8 +408,6 @@ note that the finetuning script **will not work** if you do not download both
 datasets. We will refer as `$DATA_PATH` the path to where you uncompressed both
 archive.
 
-## Bert2Bert and abstractive summarization
-
 ```bash
 export DATA_PATH=/path/to/dataset/
 
diff --git a/examples/run_summarization_finetuning.py b/examples/run_summarization_finetuning.py
index 3d194950c7..448505c727 100644
--- a/examples/run_summarization_finetuning.py
+++ b/examples/run_summarization_finetuning.py
@@ -32,7 +32,7 @@ from transformers import (
     AutoTokenizer,
     BertForMaskedLM,
     BertConfig,
-    PreTrainedSeq2seq,
+    PreTrainedEncoderDecoder,
     Model2Model,
 )
 
@@ -475,7 +475,7 @@ def main():
         for checkpoint in checkpoints:
             encoder_checkpoint = os.path.join(checkpoint, "encoder")
             decoder_checkpoint = os.path.join(checkpoint, "decoder")
-            model = PreTrainedSeq2seq.from_pretrained(
+            model = PreTrainedEncoderDecoder.from_pretrained(
                 encoder_checkpoint, decoder_checkpoint
             )
             model.to(args.device)
diff --git a/transformers/__init__.py b/transformers/__init__.py
index 2206a0302e..844aa22295 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -87,7 +87,7 @@ if is_torch_available():
     from .modeling_distilbert import (DistilBertForMaskedLM, DistilBertModel,
                                 DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
                                 DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
-    from .modeling_seq2seq import PreTrainedSeq2seq, Model2Model
+    from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
 
     # Optimization
     from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
diff --git a/transformers/modeling_seq2seq.py b/transformers/modeling_encoder_decoder.py
similarity index 96%
rename from transformers/modeling_seq2seq.py
rename to transformers/modeling_encoder_decoder.py
index ba8c546a30..162e2f8b3b 100644
--- a/transformers/modeling_seq2seq.py
+++ b/transformers/modeling_encoder_decoder.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Auto Model class. """
+""" Classes to support Encoder-Decoder architectures """
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
@@ -27,9 +27,9 @@ from .modeling_auto import AutoModel, AutoModelWithLMHead
 logger = logging.getLogger(__name__)
 
 
-class PreTrainedSeq2seq(nn.Module):
+class PreTrainedEncoderDecoder(nn.Module):
     r"""
-        :class:`~transformers.PreTrainedSeq2seq` is a generic model class that will be
+        :class:`~transformers.PreTrainedEncoderDecoder` is a generic model class that will be
         instantiated as a transformer architecture with one of the base model
         classes of the library as encoder and (optionally) another one as
         decoder when created with the `AutoModel.from_pretrained(pretrained_model_name_or_path)`
@@ -37,7 +37,7 @@ class PreTrainedSeq2seq(nn.Module):
     """
 
     def __init__(self, encoder, decoder):
-        super(PreTrainedSeq2seq, self).__init__()
+        super(PreTrainedEncoderDecoder, self).__init__()
         self.encoder = encoder
         self.decoder = decoder
 
@@ -107,7 +107,7 @@ class PreTrainedSeq2seq(nn.Module):
 
         Examples::
 
-            model = PreTrainedSeq2seq.from_pretained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
+            model = PreTrainedEncoderDecoder.from_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert
         """
 
         # keyword arguments come in 3 flavors: encoder-specific (prefixed by
@@ -155,7 +155,7 @@ class PreTrainedSeq2seq(nn.Module):
 
     def save_pretrained(self, save_directory):
         """ Save a Seq2Seq model and its configuration file in a format such
-        that it can be loaded using `:func:`~transformers.PreTrainedSeq2seq.from_pretrained`
+        that it can be loaded using `:func:`~transformers.PreTrainedEncoderDecoder.from_pretrained`
 
         We save the encoder' and decoder's parameters in two separate directories.
         """
@@ -219,7 +219,7 @@ class PreTrainedSeq2seq(nn.Module):
         return decoder_outputs + encoder_outputs
 
 
-class Model2Model(PreTrainedSeq2seq):
+class Model2Model(PreTrainedEncoderDecoder):
     r"""
         :class:`~transformers.Model2Model` instantiates a Seq2Seq2 model
         where both of the encoder and decoder are of the same family. If the
@@ -277,14 +277,14 @@ class Model2Model(PreTrainedSeq2seq):
         return model
 
 
-class Model2LSTM(PreTrainedSeq2seq):
+class Model2LSTM(PreTrainedEncoderDecoder):
     @classmethod
     def from_pretrained(cls, *args, **kwargs):
         if kwargs.get("decoder_model", None) is None:
             # We will create a randomly initilized LSTM model as decoder
             if "decoder_config" not in kwargs:
                 raise ValueError(
-                    "To load an LSTM in Seq2seq model, please supply either: "
+                    "To load an LSTM in Encoder-Decoder model, please supply either: "
                     "    - a torch.nn.LSTM model as `decoder_model` parameter (`decoder_model=lstm_model`), or"
                     "    - a dictionary of configuration parameters that will be used to initialize a"
                     "      torch.nn.LSTM model as `decoder_config` keyword argument. "

From da10de8466c001dceca328dac12751abb71c65eb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 30 Oct 2019 11:19:58 +0100
Subject: [PATCH 130/144] fix bug with padding mask + add corresponding test

---
 examples/utils_summarization.py      | 6 +++---
 examples/utils_summarization_test.py | 7 +++++++
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/examples/utils_summarization.py b/examples/utils_summarization.py
index cd8bc4bc2b..2a8f81cd36 100644
--- a/examples/utils_summarization.py
+++ b/examples/utils_summarization.py
@@ -127,9 +127,9 @@ def build_lm_labels(sequence, pad_token):
 def build_mask(sequence, pad_token):
     """ Builds the mask. The attention mechanism will only attend to positions
     with value 1. """
-    mask = sequence.clone()
-    mask[mask != pad_token] = 1
-    mask[mask == pad_token] = 0
+    mask = torch.ones_like(sequence)
+    idx_pad_tokens = (sequence == pad_token)
+    mask[idx_pad_tokens] = 0
     return mask
 
 
diff --git a/examples/utils_summarization_test.py b/examples/utils_summarization_test.py
index 7a02f8fa1f..7604bd185d 100644
--- a/examples/utils_summarization_test.py
+++ b/examples/utils_summarization_test.py
@@ -116,6 +116,13 @@ class SummarizationDataProcessingTest(unittest.TestCase):
             build_mask(sequence, 23).numpy(), expected.numpy()
         )
 
+    def test_build_mask_with_padding_equal_to_one(self):
+        sequence = torch.tensor([8, 2, 3, 4, 1, 1, 1])
+        expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
+        np.testing.assert_array_equal(
+            build_mask(sequence, 1).numpy(), expected.numpy()
+        )
+
     def test_compute_token_type_ids(self):
         separator = 101
         batch = torch.tensor(

From 070507df1ffd7609f4691089f0bbc7ac27df66fb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 30 Oct 2019 11:24:12 +0100
Subject: [PATCH 131/144] format utils for summarization

---
 examples/utils_summarization.py      | 2 +-
 examples/utils_summarization_test.py | 8 ++------
 2 files changed, 3 insertions(+), 7 deletions(-)

diff --git a/examples/utils_summarization.py b/examples/utils_summarization.py
index 2a8f81cd36..327ca8cc3e 100644
--- a/examples/utils_summarization.py
+++ b/examples/utils_summarization.py
@@ -128,7 +128,7 @@ def build_mask(sequence, pad_token):
     """ Builds the mask. The attention mechanism will only attend to positions
     with value 1. """
     mask = torch.ones_like(sequence)
-    idx_pad_tokens = (sequence == pad_token)
+    idx_pad_tokens = sequence == pad_token
     mask[idx_pad_tokens] = 0
     return mask
 
diff --git a/examples/utils_summarization_test.py b/examples/utils_summarization_test.py
index 7604bd185d..1d56ff0803 100644
--- a/examples/utils_summarization_test.py
+++ b/examples/utils_summarization_test.py
@@ -105,9 +105,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
     def test_build_mask_no_padding(self):
         sequence = torch.tensor([1, 2, 3, 4])
         expected = torch.tensor([1, 1, 1, 1])
-        np.testing.assert_array_equal(
-            build_mask(sequence, 0).numpy(), expected.numpy()
-        )
+        np.testing.assert_array_equal(build_mask(sequence, 0).numpy(), expected.numpy())
 
     def test_build_mask(self):
         sequence = torch.tensor([1, 2, 3, 4, 23, 23, 23])
@@ -119,9 +117,7 @@ class SummarizationDataProcessingTest(unittest.TestCase):
     def test_build_mask_with_padding_equal_to_one(self):
         sequence = torch.tensor([8, 2, 3, 4, 1, 1, 1])
         expected = torch.tensor([1, 1, 1, 1, 0, 0, 0])
-        np.testing.assert_array_equal(
-            build_mask(sequence, 1).numpy(), expected.numpy()
-        )
+        np.testing.assert_array_equal(build_mask(sequence, 1).numpy(), expected.numpy())
 
     def test_compute_token_type_ids(self):
         separator = 101

From 7f4226f9e63639d23397ad89e2591b5d4fc35afc Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 30 Oct 2019 11:31:56 +0100
Subject: [PATCH 132/144] adding templates

---
 .../adding_a_new_example_script/README.md     |   5 +
 .../adding_a_new_example_script/run_xxx.py    | 553 ++++++++++
 .../adding_a_new_example_script/utils_xxx.py  | 995 ++++++++++++++++++
 templates/adding_a_new_model/README.md        |  62 ++
 .../adding_a_new_model/configuration_xxx.py   | 130 +++
 ...t_xxx_original_tf_checkpoint_to_pytorch.py |  65 ++
 .../adding_a_new_model/modeling_tf_xxx.py     | 500 +++++++++
 templates/adding_a_new_model/modeling_xxx.py  | 644 ++++++++++++
 .../tests/modeling_tf_xxx_test.py             | 256 +++++
 .../tests/modeling_xxx_test.py                | 255 +++++
 .../tests/tokenization_xxx_test.py            |  57 +
 .../adding_a_new_model/tokenization_xxx.py    | 218 ++++
 12 files changed, 3740 insertions(+)
 create mode 100644 templates/adding_a_new_example_script/README.md
 create mode 100644 templates/adding_a_new_example_script/run_xxx.py
 create mode 100644 templates/adding_a_new_example_script/utils_xxx.py
 create mode 100644 templates/adding_a_new_model/README.md
 create mode 100644 templates/adding_a_new_model/configuration_xxx.py
 create mode 100755 templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
 create mode 100644 templates/adding_a_new_model/modeling_tf_xxx.py
 create mode 100644 templates/adding_a_new_model/modeling_xxx.py
 create mode 100644 templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
 create mode 100644 templates/adding_a_new_model/tests/modeling_xxx_test.py
 create mode 100644 templates/adding_a_new_model/tests/tokenization_xxx_test.py
 create mode 100644 templates/adding_a_new_model/tokenization_xxx.py

diff --git a/templates/adding_a_new_example_script/README.md b/templates/adding_a_new_example_script/README.md
new file mode 100644
index 0000000000..2afca08bf8
--- /dev/null
+++ b/templates/adding_a_new_example_script/README.md
@@ -0,0 +1,5 @@
+# How to add a new example script in 🤗Transformers
+
+This folder provides a template for adding a new example script implementing a training or inference task with the models in the 🤗Transformers library.
+
+Currently, only PyTorch examples are provided. They are adaptations of the library's SQuAD examples, which implement single-GPU and distributed training with gradient accumulation and mixed precision (using NVIDIA's apex library) to cover a reasonable range of use cases.
diff --git a/templates/adding_a_new_example_script/run_xxx.py b/templates/adding_a_new_example_script/run_xxx.py
new file mode 100644
index 0000000000..e348d9b5ea
--- /dev/null
+++ b/templates/adding_a_new_example_script/run_xxx.py
@@ -0,0 +1,553 @@
+# coding=utf-8
+# Copyright 2018 XXX.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Finetuning the library models for task XXX."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import os
+import random
+import glob
+
+import numpy as np
+import torch
+from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
+                              TensorDataset)
+from torch.utils.data.distributed import DistributedSampler
+
+try:
+    from torch.utils.tensorboard import SummaryWriter
+except ImportError:  # torch.utils.tensorboard cannot be imported: fall back to tensorboardX
+    from tensorboardX import SummaryWriter
+
+from tqdm import tqdm, trange
+
+from transformers import (WEIGHTS_NAME, BertConfig,
+                                  BertForQuestionAnswering, BertTokenizer,
+                                  XLMConfig, XLMForQuestionAnswering,
+                                  XLMTokenizer, XLNetConfig,
+                                  XLNetForQuestionAnswering,
+                                  XLNetTokenizer,
+                                  DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+
+from transformers import AdamW, WarmupLinearSchedule
+
+from utils_squad import (read_squad_examples, convert_examples_to_features,
+                         RawResult, write_predictions,
+                         RawResultExtended, write_predictions_extended)
+
+# The following import is the official SQuAD evaluation script (2.0).
+# You can remove it from the dependencies if you are using this script outside of the library
+# We've added it here for automated tests (see examples/test_examples.py file)
+from utils_squad_evaluate import EVAL_OPTS, main as evaluate_on_squad
+
+logger = logging.getLogger(__name__)
+
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) \
+                  for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
+
+MODEL_CLASSES = {
+    'bert': (BertConfig, BertForQuestionAnswering, BertTokenizer),
+    'xlnet': (XLNetConfig, XLNetForQuestionAnswering, XLNetTokenizer),
+    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
+    'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer)
+}
+
+def set_seed(args):
+    random.seed(args.seed)
+    np.random.seed(args.seed)
+    torch.manual_seed(args.seed)
+    if args.n_gpu > 0:
+        torch.cuda.manual_seed_all(args.seed)
+
+def to_list(tensor):
+    return tensor.detach().cpu().tolist()
+
+def train(args, train_dataset, model, tokenizer):
+    """ Train the model on `train_dataset`; return (global_step, training loss averaged over optimization steps). """
+    if args.local_rank in [-1, 0]:
+        tb_writer = SummaryWriter()
+
+    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
+    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
+    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)
+
+    if args.max_steps > 0:
+        t_total = args.max_steps
+        args.num_train_epochs = args.max_steps // max(1, len(train_dataloader) // args.gradient_accumulation_steps) + 1  # max(1, ...) avoids dividing by zero on tiny datasets
+    else:
+        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
+
+    # Prepare optimizer and schedule (linear warmup and decay)
+    no_decay = ['bias', 'LayerNorm.weight']
+    optimizer_grouped_parameters = [
+        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
+        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
+        ]
+    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
+    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=t_total)
+    if args.fp16:
+        try:
+            from apex import amp
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)
+
+    # multi-gpu training (should be after apex fp16 initialization)
+    if args.n_gpu > 1:
+        model = torch.nn.DataParallel(model)
+
+    # Distributed training (should be after apex fp16 initialization)
+    if args.local_rank != -1:
+        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
+                                                          output_device=args.local_rank,
+                                                          find_unused_parameters=True)
+
+    # Train!
+    logger.info("***** Running training *****")
+    logger.info("  Num examples = %d", len(train_dataset))
+    logger.info("  Num Epochs = %d", args.num_train_epochs)
+    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
+    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
+                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
+    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
+    logger.info("  Total optimization steps = %d", t_total)
+
+    global_step = 0
+    tr_loss, logging_loss = 0.0, 0.0
+    model.zero_grad()
+    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
+    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
+    for _ in train_iterator:
+        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
+        for step, batch in enumerate(epoch_iterator):
+            model.train()
+            batch = tuple(t.to(args.device) for t in batch)
+            inputs = {'input_ids':       batch[0],
+                      'attention_mask':  batch[1],
+                      'start_positions': batch[3],
+                      'end_positions':   batch[4]}
+            if args.model_type != 'distilbert':
+                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]
+            if args.model_type in ['xlnet', 'xlm']:
+                inputs.update({'cls_index': batch[5],
+                               'p_mask':       batch[6]})
+            outputs = model(**inputs)
+            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
+
+            if args.n_gpu > 1:
+                loss = loss.mean() # mean() to average on multi-gpu parallel (not distributed) training
+            if args.gradient_accumulation_steps > 1:
+                loss = loss / args.gradient_accumulation_steps
+
+            if args.fp16:
+                with amp.scale_loss(loss, optimizer) as scaled_loss:
+                    scaled_loss.backward()
+            else:
+                loss.backward()
+
+            tr_loss += loss.item()
+            if (step + 1) % args.gradient_accumulation_steps == 0:
+                if args.fp16:
+                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
+                else:
+                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
+
+                optimizer.step()
+                scheduler.step()  # Update learning rate schedule
+                model.zero_grad()
+                global_step += 1
+
+                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
+                    # Log metrics
+                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
+                        results = evaluate(args, model, tokenizer)
+                        for key, value in results.items():
+                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
+                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
+                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
+                    logging_loss = tr_loss
+
+                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
+                    # Save model checkpoint
+                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
+                    if not os.path.exists(output_dir):
+                        os.makedirs(output_dir)
+                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+                    model_to_save.save_pretrained(output_dir)
+                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
+                    logger.info("Saving model checkpoint to %s", output_dir)
+
+            if args.max_steps > 0 and global_step >= args.max_steps:  # stop after exactly max_steps updates (matches scheduler t_total)
+                epoch_iterator.close()
+                break
+        if args.max_steps > 0 and global_step >= args.max_steps:  # propagate the stop to the epoch loop
+            train_iterator.close()
+            break
+
+    if args.local_rank in [-1, 0]:
+        tb_writer.close()
+
+    return global_step, tr_loss / max(global_step, 1)  # guard: no optimizer step may have run
+
+
+def evaluate(args, model, tokenizer, prefix=""):
+    dataset, examples, features = load_and_cache_examples(args, tokenizer, evaluate=True, output_examples=True)
+
+    if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+        os.makedirs(args.output_dir)
+
+    args.eval_batch_size = args.per_gpu_eval_batch_size * max(1, args.n_gpu)
+    # Note that DistributedSampler samples randomly
+    eval_sampler = SequentialSampler(dataset) if args.local_rank == -1 else DistributedSampler(dataset)
+    eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
+
+    # Eval!
+    logger.info("***** Running evaluation {} *****".format(prefix))
+    logger.info("  Num examples = %d", len(dataset))
+    logger.info("  Batch size = %d", args.eval_batch_size)
+    all_results = []
+    for batch in tqdm(eval_dataloader, desc="Evaluating"):
+        model.eval()
+        batch = tuple(t.to(args.device) for t in batch)
+        with torch.no_grad():
+            inputs = {'input_ids':      batch[0],
+                      'attention_mask': batch[1]
+                      }
+            if args.model_type != 'distilbert':
+                inputs['token_type_ids'] = None if args.model_type == 'xlm' else batch[2]  # XLM don't use segment_ids
+            example_indices = batch[3]
+            if args.model_type in ['xlnet', 'xlm']:
+                inputs.update({'cls_index': batch[4],
+                               'p_mask':    batch[5]})
+            outputs = model(**inputs)
+
+        for i, example_index in enumerate(example_indices):
+            eval_feature = features[example_index.item()]
+            unique_id = int(eval_feature.unique_id)
+            if args.model_type in ['xlnet', 'xlm']:
+                # XLNet uses a more complex post-processing procedure
+                result = RawResultExtended(unique_id            = unique_id,
+                                           start_top_log_probs  = to_list(outputs[0][i]),
+                                           start_top_index      = to_list(outputs[1][i]),
+                                           end_top_log_probs    = to_list(outputs[2][i]),
+                                           end_top_index        = to_list(outputs[3][i]),
+                                           cls_logits           = to_list(outputs[4][i]))
+            else:
+                result = RawResult(unique_id    = unique_id,
+                                   start_logits = to_list(outputs[0][i]),
+                                   end_logits   = to_list(outputs[1][i]))
+            all_results.append(result)
+
+    # Compute predictions
+    output_prediction_file = os.path.join(args.output_dir, "predictions_{}.json".format(prefix))
+    output_nbest_file = os.path.join(args.output_dir, "nbest_predictions_{}.json".format(prefix))
+    if args.version_2_with_negative:
+        output_null_log_odds_file = os.path.join(args.output_dir, "null_odds_{}.json".format(prefix))
+    else:
+        output_null_log_odds_file = None
+
+    if args.model_type in ['xlnet', 'xlm']:
+        # XLNet uses a more complex post-processing procedure
+        write_predictions_extended(examples, features, all_results, args.n_best_size,
+                        args.max_answer_length, output_prediction_file,
+                        output_nbest_file, output_null_log_odds_file, args.predict_file,
+                        model.config.start_n_top, model.config.end_n_top,
+                        args.version_2_with_negative, tokenizer, args.verbose_logging)
+    else:
+        write_predictions(examples, features, all_results, args.n_best_size,
+                        args.max_answer_length, args.do_lower_case, output_prediction_file,
+                        output_nbest_file, output_null_log_odds_file, args.verbose_logging,
+                        args.version_2_with_negative, args.null_score_diff_threshold)
+
+    # Evaluate with the official SQuAD script
+    evaluate_options = EVAL_OPTS(data_file=args.predict_file,
+                                 pred_file=output_prediction_file,
+                                 na_prob_file=output_null_log_odds_file)
+    results = evaluate_on_squad(evaluate_options)
+    return results
+
+
+def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False):
+    if args.local_rank not in [-1, 0] and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Load data features from cache or dataset file
+    input_file = args.predict_file if evaluate else args.train_file
+    cached_features_file = os.path.join(os.path.dirname(input_file), 'cached_{}_{}_{}'.format(
+        'dev' if evaluate else 'train',
+        list(filter(None, args.model_name_or_path.split('/'))).pop(),
+        str(args.max_seq_length)))
+    if os.path.exists(cached_features_file) and not args.overwrite_cache and not output_examples:
+        logger.info("Loading features from cached file %s", cached_features_file)
+        features = torch.load(cached_features_file)
+    else:
+        logger.info("Creating features from dataset file at %s", input_file)
+        examples = read_squad_examples(input_file=input_file,
+                                                is_training=not evaluate,
+                                                version_2_with_negative=args.version_2_with_negative)
+        features = convert_examples_to_features(examples=examples,
+                                                tokenizer=tokenizer,
+                                                max_seq_length=args.max_seq_length,
+                                                doc_stride=args.doc_stride,
+                                                max_query_length=args.max_query_length,
+                                                is_training=not evaluate)
+        if args.local_rank in [-1, 0]:
+            logger.info("Saving features into cached file %s", cached_features_file)
+            torch.save(features, cached_features_file)
+
+    if args.local_rank == 0 and not evaluate:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training process the dataset, and the others will use the cache
+
+    # Convert to Tensors and build dataset
+    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
+    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
+    all_segment_ids = torch.tensor([f.segment_ids for f in features], dtype=torch.long)
+    all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
+    all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
+    if evaluate:
+        all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_example_index, all_cls_index, all_p_mask)
+    else:
+        all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
+        all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
+        dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
+                                all_start_positions, all_end_positions,
+                                all_cls_index, all_p_mask)
+
+    if output_examples:
+        return dataset, examples, features
+    return dataset
+
+
+def main():
+    parser = argparse.ArgumentParser()
+
+    ## Required parameters
+    parser.add_argument("--train_file", default=None, type=str, required=True,
+                        help="SQuAD json for training. E.g., train-v1.1.json")
+    parser.add_argument("--predict_file", default=None, type=str, required=True,
+                        help="SQuAD json for predictions. E.g., dev-v1.1.json or test-v1.1.json")
+    parser.add_argument("--model_type", default=None, type=str, required=True,
+                        help="Model type selected in the list: " + ", ".join(MODEL_CLASSES.keys()))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--output_dir", default=None, type=str, required=True,
+                        help="The output directory where the model checkpoints and predictions will be written.")
+
+    ## Other parameters
+    parser.add_argument("--config_name", default="", type=str,
+                        help="Pretrained config name or path if not the same as model_name")
+    parser.add_argument("--tokenizer_name", default="", type=str,
+                        help="Pretrained tokenizer name or path if not the same as model_name")
+    parser.add_argument("--cache_dir", default="", type=str,
+                        help="Where do you want to store the pre-trained models downloaded from s3")
+
+    parser.add_argument('--version_2_with_negative', action='store_true',
+                        help='If true, the SQuAD examples contain some that do not have an answer.')
+    parser.add_argument('--null_score_diff_threshold', type=float, default=0.0,
+                        help="If null_score - best_non_null is greater than the threshold predict null.")
+
+    parser.add_argument("--max_seq_length", default=384, type=int,
+                        help="The maximum total input sequence length after WordPiece tokenization. Sequences "
+                             "longer than this will be truncated, and sequences shorter than this will be padded.")
+    parser.add_argument("--doc_stride", default=128, type=int,
+                        help="When splitting up a long document into chunks, how much stride to take between chunks.")
+    parser.add_argument("--max_query_length", default=64, type=int,
+                        help="The maximum number of tokens for the question. Questions longer than this will "
+                             "be truncated to this length.")
+    parser.add_argument("--do_train", action='store_true',
+                        help="Whether to run training.")
+    parser.add_argument("--do_eval", action='store_true',
+                        help="Whether to run eval on the dev set.")
+    parser.add_argument("--evaluate_during_training", action='store_true',
+                        help="Rul evaluation during training at each logging step.")
+    parser.add_argument("--do_lower_case", action='store_true',
+                        help="Set this flag if you are using an uncased model.")
+
+    parser.add_argument("--per_gpu_train_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for training.")
+    parser.add_argument("--per_gpu_eval_batch_size", default=8, type=int,
+                        help="Batch size per GPU/CPU for evaluation.")
+    parser.add_argument("--learning_rate", default=5e-5, type=float,
+                        help="The initial learning rate for Adam.")
+    parser.add_argument('--gradient_accumulation_steps', type=int, default=1,
+                        help="Number of updates steps to accumulate before performing a backward/update pass.")
+    parser.add_argument("--weight_decay", default=0.0, type=float,
+                        help="Weight deay if we apply some.")
+    parser.add_argument("--adam_epsilon", default=1e-8, type=float,
+                        help="Epsilon for Adam optimizer.")
+    parser.add_argument("--max_grad_norm", default=1.0, type=float,
+                        help="Max gradient norm.")
+    parser.add_argument("--num_train_epochs", default=3.0, type=float,
+                        help="Total number of training epochs to perform.")
+    parser.add_argument("--max_steps", default=-1, type=int,
+                        help="If > 0: set total number of training steps to perform. Override num_train_epochs.")
+    parser.add_argument("--warmup_steps", default=0, type=int,
+                        help="Linear warmup over warmup_steps.")
+    parser.add_argument("--n_best_size", default=20, type=int,
+                        help="The total number of n-best predictions to generate in the nbest_predictions.json output file.")
+    parser.add_argument("--max_answer_length", default=30, type=int,
+                        help="The maximum length of an answer that can be generated. This is needed because the start "
+                             "and end predictions are not conditioned on one another.")
+    parser.add_argument("--verbose_logging", action='store_true',
+                        help="If true, all of the warnings related to data processing will be printed. "
+                             "A number of warnings are expected for a normal SQuAD evaluation.")
+
+    parser.add_argument('--logging_steps', type=int, default=50,
+                        help="Log every X updates steps.")
+    parser.add_argument('--save_steps', type=int, default=50,
+                        help="Save checkpoint every X updates steps.")
+    parser.add_argument("--eval_all_checkpoints", action='store_true',
+                        help="Evaluate all checkpoints starting with the same prefix as model_name ending and ending with step number")
+    parser.add_argument("--no_cuda", action='store_true',
+                        help="Whether not to use CUDA when available")
+    parser.add_argument('--overwrite_output_dir', action='store_true',
+                        help="Overwrite the content of the output directory")
+    parser.add_argument('--overwrite_cache', action='store_true',
+                        help="Overwrite the cached training and evaluation sets")
+    parser.add_argument('--seed', type=int, default=42,
+                        help="random seed for initialization")
+
+    parser.add_argument("--local_rank", type=int, default=-1,
+                        help="local_rank for distributed training on gpus")
+    parser.add_argument('--fp16', action='store_true',
+                        help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit")
+    parser.add_argument('--fp16_opt_level', type=str, default='O1',
+                        help="For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
+                             "See details at https://nvidia.github.io/apex/amp.html")
+    parser.add_argument('--server_ip', type=str, default='', help="Can be used for distant debugging.")
+    parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
+    args = parser.parse_args()
+
+    if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
+        raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
+
+    # Setup distant debugging if needed
+    if args.server_ip and args.server_port:
+        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
+        import ptvsd
+        print("Waiting for debugger attach")
+        ptvsd.enable_attach(address=(args.server_ip, args.server_port), redirect_output=True)
+        ptvsd.wait_for_attach()
+
+    # Setup CUDA, GPU & distributed training
+    if args.local_rank == -1 or args.no_cuda:
+        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
+        args.n_gpu = torch.cuda.device_count()
+    else:  # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
+        torch.cuda.set_device(args.local_rank)
+        device = torch.device("cuda", args.local_rank)
+        torch.distributed.init_process_group(backend='nccl')
+        args.n_gpu = 1
+    args.device = device
+
+    # Setup logging
+    logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
+                        datefmt = '%m/%d/%Y %H:%M:%S',
+                        level = logging.INFO if args.local_rank in [-1, 0] else logging.WARN)
+    logger.warning("Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
+                    args.local_rank, device, args.n_gpu, bool(args.local_rank != -1), args.fp16)
+
+    # Set seed
+    set_seed(args)
+
+    # Load pretrained model and tokenizer
+    if args.local_rank not in [-1, 0]:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    args.model_type = args.model_type.lower()
+    config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path, do_lower_case=args.do_lower_case)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
+
+    if args.local_rank == 0:
+        torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab
+
+    model.to(args.device)
+
+    logger.info("Training/evaluation parameters %s", args)
+
+    # Before we do anything with models, we want to ensure that we get fp16 execution of torch.einsum if args.fp16 is set.
+    # Otherwise it'll default to "promote" mode, and we'll get fp32 operations. Note that running `--fp16_opt_level="O2"` will
+    # remove the need for this code, but it is still valid.
+    if args.fp16:
+        try:
+            import apex
+            apex.amp.register_half_function(torch, 'einsum')
+        except ImportError:
+            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
+
+    # Training
+    if args.do_train:
+        train_dataset = load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=False)
+        global_step, tr_loss = train(args, train_dataset, model, tokenizer)
+        logger.info(" global_step = %s, average loss = %s", global_step, tr_loss)
+
+
+    # Save the trained model and the tokenizer
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
+        # Create output directory if needed
+        if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
+            os.makedirs(args.output_dir)
+
+        logger.info("Saving model checkpoint to %s", args.output_dir)
+        # Save a trained model, configuration and tokenizer using `save_pretrained()`.
+        # They can then be reloaded using `from_pretrained()`
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
+        tokenizer.save_pretrained(args.output_dir)
+
+        # Good practice: save your training arguments together with the trained model
+        torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))
+
+        # Load a trained model and vocabulary that you have fine-tuned
+        model = model_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
+        model.to(args.device)
+
+
+    # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
+    results = {}
+    if args.do_eval and args.local_rank in [-1, 0]:
+        checkpoints = [args.output_dir]
+        if args.eval_all_checkpoints:
+            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+
+        logger.info("Evaluate the following checkpoints: %s", checkpoints)
+
+        for checkpoint in checkpoints:
+            # Reload the model
+            global_step = checkpoint.split('-')[-1] if len(checkpoints) > 1 else ""
+            model = model_class.from_pretrained(checkpoint)
+            model.to(args.device)
+
+            # Evaluate
+            result = evaluate(args, model, tokenizer, prefix=global_step)
+
+            result = dict((k + ('_{}'.format(global_step) if global_step else ''), v) for k, v in result.items())
+            results.update(result)
+
+    logger.info("Results: {}".format(results))
+
+    return results
+
+
+if __name__ == "__main__":
+    main()
diff --git a/templates/adding_a_new_example_script/utils_xxx.py b/templates/adding_a_new_example_script/utils_xxx.py
new file mode 100644
index 0000000000..3f4145e028
--- /dev/null
+++ b/templates/adding_a_new_example_script/utils_xxx.py
@@ -0,0 +1,995 @@
+
+# coding=utf-8
+# Copyright 2018 XXX.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Load XXX dataset. """
+
+from __future__ import absolute_import, division, print_function
+
+import json
+import logging
+import math
+import collections
+from io import open
+
+from transformers.tokenization_bert import BasicTokenizer, whitespace_tokenize
+
+# Required by XLNet evaluation method to compute optimal threshold (see write_predictions_extended() method)
+from utils_squad_evaluate import find_all_best_thresh_v2, make_qid_to_has_ans, get_raw_scores
+
+logger = logging.getLogger(__name__)
+
+
+class SquadExample(object):
+    """
+    A single training/test example for the Squad dataset; positions are word indices
+    into `doc_tokens` (-1 for examples without an answer, None when unlabeled).
+    """
+
+    def __init__(self,
+                 qas_id,
+                 question_text,
+                 doc_tokens,
+                 orig_answer_text=None,
+                 start_position=None,
+                 end_position=None,
+                 is_impossible=None):
+        self.qas_id = qas_id
+        self.question_text = question_text
+        self.doc_tokens = doc_tokens
+        self.orig_answer_text = orig_answer_text
+        self.start_position = start_position
+        self.end_position = end_position
+        self.is_impossible = is_impossible
+
+    def __str__(self):
+        return self.__repr__()
+
+    def __repr__(self):
+        """Return a one-line summary; positions are shown whenever they are set."""
+        s = "qas_id: %s" % (self.qas_id)
+        s += ", question_text: %s" % (self.question_text)
+        s += ", doc_tokens: [%s]" % (" ".join(self.doc_tokens))
+        # Compare against None: word index 0 is a valid (but falsy) position.
+        if self.start_position is not None:
+            s += ", start_position: %d" % (self.start_position)
+        if self.end_position is not None:
+            s += ", end_position: %d" % (self.end_position)
+        if self.is_impossible:
+            s += ", is_impossible: %r" % (self.is_impossible)
+        return s
+
+
+class InputFeatures(object):
+    """Features for one document window of one example, as built by convert_examples_to_features()."""
+
+    def __init__(self,
+                 unique_id,
+                 example_index,
+                 doc_span_index,
+                 tokens,
+                 token_to_orig_map,
+                 token_is_max_context,
+                 input_ids,
+                 input_mask,
+                 segment_ids,
+                 cls_index,
+                 p_mask,
+                 paragraph_len,
+                 start_position=None,
+                 end_position=None,
+                 is_impossible=None):
+        self.unique_id = unique_id  # id of this feature; used to look up its model result
+        self.example_index = example_index  # index of the source example
+        self.doc_span_index = doc_span_index  # index of the window within that example
+        self.tokens = tokens  # token strings, special tokens included
+        self.token_to_orig_map = token_to_orig_map  # token position -> index into example.doc_tokens
+        self.token_is_max_context = token_is_max_context  # token position -> bool
+        self.input_ids = input_ids
+        self.input_mask = input_mask
+        self.segment_ids = segment_ids
+        self.cls_index = cls_index  # position of the classification token
+        self.p_mask = p_mask  # 1 for tokens that cannot be in the answer, 0 otherwise
+        self.paragraph_len = paragraph_len  # number of document tokens in this window
+        self.start_position = start_position  # answer span in token positions; None outside training
+        self.end_position = end_position
+        self.is_impossible = is_impossible
+
+
+def read_squad_examples(input_file, is_training, version_2_with_negative):
+    """Read a SQuAD json file into a list of SquadExample (unrecoverable training answers are skipped)."""
+    with open(input_file, "r", encoding='utf-8') as reader:
+        input_data = json.load(reader)["data"]
+
+    def is_whitespace(c):
+        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:  # U+202F: narrow no-break space
+            return True
+        return False
+
+    examples = []
+    for entry in input_data:
+        for paragraph in entry["paragraphs"]:
+            paragraph_text = paragraph["context"]
+            doc_tokens = []
+            char_to_word_offset = []
+            prev_is_whitespace = True
+            for c in paragraph_text:
+                if is_whitespace(c):
+                    prev_is_whitespace = True
+                else:
+                    if prev_is_whitespace:
+                        doc_tokens.append(c)
+                    else:
+                        doc_tokens[-1] += c
+                    prev_is_whitespace = False
+                char_to_word_offset.append(len(doc_tokens) - 1)  # char index -> index of the latest word so far
+
+            for qa in paragraph["qas"]:
+                qas_id = qa["id"]
+                question_text = qa["question"]
+                start_position = None
+                end_position = None
+                orig_answer_text = None
+                is_impossible = False
+                if is_training:
+                    if version_2_with_negative:
+                        is_impossible = qa["is_impossible"]
+                    if (len(qa["answers"]) != 1) and (not is_impossible):
+                        raise ValueError(
+                            "For training, each question should have exactly 1 answer.")
+                    if not is_impossible:
+                        answer = qa["answers"][0]
+                        orig_answer_text = answer["text"]
+                        answer_offset = answer["answer_start"]
+                        answer_length = len(orig_answer_text)
+                        start_position = char_to_word_offset[answer_offset]
+                        end_position = char_to_word_offset[answer_offset + answer_length - 1]  # word of the last answer char
+                        # Only add answers where the text can be exactly recovered from the
+                        # document. If this CAN'T happen it's likely due to weird Unicode
+                        # stuff so we will just skip the example.
+                        #
+                        # Note that this means for training mode, every example is NOT
+                        # guaranteed to be preserved.
+                        actual_text = " ".join(doc_tokens[start_position:(end_position + 1)])
+                        cleaned_answer_text = " ".join(
+                            whitespace_tokenize(orig_answer_text))
+                        if actual_text.find(cleaned_answer_text) == -1:
+                            logger.warning("Could not find answer: '%s' vs. '%s'",
+                                           actual_text, cleaned_answer_text)
+                            continue
+                    else:
+                        start_position = -1
+                        end_position = -1
+                        orig_answer_text = ""
+
+                example = SquadExample(
+                    qas_id=qas_id,
+                    question_text=question_text,
+                    doc_tokens=doc_tokens,
+                    orig_answer_text=orig_answer_text,
+                    start_position=start_position,
+                    end_position=end_position,
+                    is_impossible=is_impossible)
+                examples.append(example)
+    return examples
+
+
+def convert_examples_to_features(examples, tokenizer, max_seq_length,
+                                 doc_stride, max_query_length, is_training,
+                                 cls_token_at_end=False,
+                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
+                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
+                                 cls_token_segment_id=0, pad_token_segment_id=0,
+                                 mask_padding_with_zero=True):
+    """Convert a list of `SquadExample`s into a flat list of `InputFeatures`.
+
+    Every document is sub-tokenized with `tokenizer` and cut into windows of at most
+    `max_seq_length - len(query) - 3` tokens that advance by `doc_stride`; one feature
+    is built per window as "[CLS] query [SEP] doc [SEP]" (CLS moves to the end when
+    `cls_token_at_end`), padded to `max_seq_length`. When `is_training`, the answer span
+    is mapped into the window, or set to the CLS index if the window does not contain it.
+    """
+
+    unique_id = 1000000000  # running id, incremented once per emitted feature
+    features = []
+    for (example_index, example) in enumerate(examples):
+
+        query_tokens = tokenizer.tokenize(example.question_text)
+
+        if len(query_tokens) > max_query_length:
+            query_tokens = query_tokens[0:max_query_length]
+
+        tok_to_orig_index = []
+        orig_to_tok_index = []
+        all_doc_tokens = []
+        for (i, token) in enumerate(example.doc_tokens):
+            orig_to_tok_index.append(len(all_doc_tokens))
+            sub_tokens = tokenizer.tokenize(token)
+            for sub_token in sub_tokens:
+                tok_to_orig_index.append(i)
+                all_doc_tokens.append(sub_token)
+
+        tok_start_position = None
+        tok_end_position = None
+        if is_training and example.is_impossible:
+            tok_start_position = -1
+            tok_end_position = -1
+        if is_training and not example.is_impossible:
+            tok_start_position = orig_to_tok_index[example.start_position]
+            if example.end_position < len(example.doc_tokens) - 1:
+                tok_end_position = orig_to_tok_index[example.end_position + 1] - 1
+            else:
+                tok_end_position = len(all_doc_tokens) - 1
+            (tok_start_position, tok_end_position) = _improve_answer_span(
+                all_doc_tokens, tok_start_position, tok_end_position, tokenizer,
+                example.orig_answer_text)
+
+        # The -3 accounts for [CLS], [SEP] and [SEP]
+        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3
+
+        # We can have documents that are longer than the maximum sequence length.
+        # To deal with this we do a sliding window approach, where we take chunks
+        # of up to our max length with a stride of `doc_stride`.
+        _DocSpan = collections.namedtuple(  # pylint: disable=invalid-name
+            "DocSpan", ["start", "length"])
+        doc_spans = []
+        start_offset = 0
+        while start_offset < len(all_doc_tokens):
+            length = len(all_doc_tokens) - start_offset
+            if length > max_tokens_for_doc:
+                length = max_tokens_for_doc
+            doc_spans.append(_DocSpan(start=start_offset, length=length))
+            if start_offset + length == len(all_doc_tokens):
+                break
+            start_offset += min(length, doc_stride)
+
+        for (doc_span_index, doc_span) in enumerate(doc_spans):
+            tokens = []
+            token_to_orig_map = {}
+            token_is_max_context = {}
+            segment_ids = []
+
+            # p_mask: mask with 1 for tokens that cannot be in the answer (0 for tokens which can be in an answer)
+            # Original TF implem also keep the classification token (set to 0) (not sure why...)
+            p_mask = []
+
+            # CLS token at the beginning
+            if not cls_token_at_end:
+                tokens.append(cls_token)
+                segment_ids.append(cls_token_segment_id)
+                p_mask.append(0)
+                cls_index = 0
+
+            # Query
+            for token in query_tokens:
+                tokens.append(token)
+                segment_ids.append(sequence_a_segment_id)
+                p_mask.append(1)
+
+            # SEP token
+            tokens.append(sep_token)
+            segment_ids.append(sequence_a_segment_id)
+            p_mask.append(1)
+
+            # Paragraph
+            for i in range(doc_span.length):
+                split_token_index = doc_span.start + i
+                token_to_orig_map[len(tokens)] = tok_to_orig_index[split_token_index]
+
+                is_max_context = _check_is_max_context(doc_spans, doc_span_index,
+                                                       split_token_index)
+                token_is_max_context[len(tokens)] = is_max_context
+                tokens.append(all_doc_tokens[split_token_index])
+                segment_ids.append(sequence_b_segment_id)
+                p_mask.append(0)
+            paragraph_len = doc_span.length
+
+            # SEP token
+            tokens.append(sep_token)
+            segment_ids.append(sequence_b_segment_id)
+            p_mask.append(1)
+
+            # CLS token at the end
+            if cls_token_at_end:
+                tokens.append(cls_token)
+                segment_ids.append(cls_token_segment_id)
+                p_mask.append(0)
+                cls_index = len(tokens) - 1  # Index of classification token
+
+            input_ids = tokenizer.convert_tokens_to_ids(tokens)
+
+            # The mask has 1 for real tokens and 0 for padding tokens. Only real
+            # tokens are attended to.
+            input_mask = [1 if mask_padding_with_zero else 0] * len(input_ids)
+
+            # Zero-pad up to the sequence length.
+            while len(input_ids) < max_seq_length:
+                input_ids.append(pad_token)
+                input_mask.append(0 if mask_padding_with_zero else 1)
+                segment_ids.append(pad_token_segment_id)
+                p_mask.append(1)
+
+            assert len(input_ids) == max_seq_length
+            assert len(input_mask) == max_seq_length
+            assert len(segment_ids) == max_seq_length
+
+            span_is_impossible = example.is_impossible
+            start_position = None
+            end_position = None
+            if is_training and not span_is_impossible:
+                # For training, if our document chunk does not fully contain the annotation
+                # we label it as impossible; its answer then points at the CLS token below.
+                doc_start = doc_span.start
+                doc_end = doc_span.start + doc_span.length - 1
+                out_of_span = not (tok_start_position >= doc_start and
+                                   tok_end_position <= doc_end)
+                if out_of_span:
+                    start_position = 0
+                    end_position = 0
+                    span_is_impossible = True
+                else:
+                    # The paragraph starts after "[CLS] query [SEP]" when CLS comes
+                    # first, but after only "query [SEP]" when CLS is appended last.
+                    doc_offset = len(query_tokens) + (1 if cls_token_at_end else 2)
+                    start_position = tok_start_position - doc_start + doc_offset
+                    end_position = tok_end_position - doc_start + doc_offset
+
+            if is_training and span_is_impossible:
+                start_position = cls_index
+                end_position = cls_index
+
+            if example_index < 20:
+                logger.info("*** Example ***")
+                logger.info("unique_id: %s" % (unique_id))
+                logger.info("example_index: %s" % (example_index))
+                logger.info("doc_span_index: %s" % (doc_span_index))
+                logger.info("tokens: %s" % " ".join(tokens))
+                logger.info("token_to_orig_map: %s" % " ".join([
+                    "%d:%d" % (x, y) for (x, y) in token_to_orig_map.items()]))
+                logger.info("token_is_max_context: %s" % " ".join([
+                    "%d:%s" % (x, y) for (x, y) in token_is_max_context.items()
+                ]))
+                logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
+                logger.info(
+                    "input_mask: %s" % " ".join([str(x) for x in input_mask]))
+                logger.info(
+                    "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))
+                if is_training and span_is_impossible:
+                    logger.info("impossible example")
+                if is_training and not span_is_impossible:
+                    answer_text = " ".join(tokens[start_position:(end_position + 1)])
+                    logger.info("start_position: %d" % (start_position))
+                    logger.info("end_position: %d" % (end_position))
+                    logger.info(
+                        "answer: %s" % (answer_text))
+
+            features.append(
+                InputFeatures(
+                    unique_id=unique_id,
+                    example_index=example_index,
+                    doc_span_index=doc_span_index,
+                    tokens=tokens,
+                    token_to_orig_map=token_to_orig_map,
+                    token_is_max_context=token_is_max_context,
+                    input_ids=input_ids,
+                    input_mask=input_mask,
+                    segment_ids=segment_ids,
+                    cls_index=cls_index,
+                    p_mask=p_mask,
+                    paragraph_len=paragraph_len,
+                    start_position=start_position,
+                    end_position=end_position,
+                    is_impossible=span_is_impossible))
+            unique_id += 1
+
+    return features
+
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
+                         orig_answer_text):
+    """Return the (start, end) sub-span of the input span that equals the tokenized answer, else the input span."""
+
+    # The SQuAD annotations are character based. We first project them to
+    # whitespace-tokenized words. But then after WordPiece tokenization, we can
+    # often find a "better match". For example:
+    #
+    #   Question: What year was John Smith born?
+    #   Context: The leader was John Smith (1895-1943).
+    #   Answer: 1895
+    #
+    # The original whitespace-tokenized answer will be "(1895-1943).". However
+    # after tokenization, our tokens will be "( 1895 - 1943 ) .". So we can match
+    # the exact answer, 1895.
+    #
+    # However, this is not always possible. Consider the following:
+    #
+    #   Question: What country is the top exporter of electronics?
+    #   Context: The Japanese electronics industry is the largest in the world.
+    #   Answer: Japan
+    #
+    # In this case, the annotator chose "Japan" as a character sub-span of
+    # the word "Japanese". Since our WordPiece tokenizer does not split
+    # "Japanese", we just use "Japanese" as the annotation. This is fairly rare
+    # in SQuAD, but does happen.
+    tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
+
+    for new_start in range(input_start, input_end + 1):
+        for new_end in range(input_end, new_start - 1, -1):  # widest candidate first for each start
+            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+            if text_span == tok_answer_text:
+                return (new_start, new_end)
+
+    return (input_start, input_end)  # no exact match found: keep the original span
+
+
+def _check_is_max_context(doc_spans, cur_span_index, position):
+    """Return True if span `cur_span_index` is the best-scoring ('max context') doc span covering `position`."""
+
+    # Because of the sliding window approach taken to scoring documents, a single
+    # token can appear in multiple doc spans. E.g.
+    #  Doc: the man went to the store and bought a gallon of milk
+    #  Span A: the man went to the
+    #  Span B: to the store and bought
+    #  Span C: and bought a gallon of
+    #  ...
+    #
+    # Now the word 'bought' will have two scores from spans B and C. We only
+    # want to consider the score with "maximum context", which we define as
+    # the *minimum* of its left and right context (the *sum* of left and
+    # right context will always be the same, of course).
+    #
+    # In the example the maximum context for 'bought' would be span C since
+    # it has 1 left context and 3 right context, while span B has 4 left context
+    # and 0 right context.
+    best_score = None
+    best_span_index = None
+    for (span_index, doc_span) in enumerate(doc_spans):
+        end = doc_span.start + doc_span.length - 1
+        if position < doc_span.start:
+            continue
+        if position > end:
+            continue
+        num_left_context = position - doc_span.start
+        num_right_context = end - position
+        score = min(num_left_context, num_right_context) + 0.01 * doc_span.length  # small bonus for longer spans
+        if best_score is None or score > best_score:  # strict '>': the earliest span wins exact ties
+            best_score = score
+            best_span_index = span_index
+
+    return cur_span_index == best_span_index
+
+
+RawResult = collections.namedtuple("RawResult",  # model output for one feature, matched by unique_id
+                                   ["unique_id", "start_logits", "end_logits"])
+
+def write_predictions(all_examples, all_features, all_results, n_best_size,
+                      max_answer_length, do_lower_case, output_prediction_file,
+                      output_nbest_file, output_null_log_odds_file, verbose_logging,
+                      version_2_with_negative, null_score_diff_threshold):
+    """Write final predictions and n-best lists to json files (plus null log-odds if needed); return the predictions."""
+    logger.info("Writing predictions to: %s" % (output_prediction_file))
+    logger.info("Writing nbest to: %s" % (output_nbest_file))
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction",
+        ["feature_index", "start_index", "end_index", "start_logit", "end_logit"])
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for (example_index, example) in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+        min_null_feature_index = 0  # the paragraph slice with min null score
+        null_start_logit = 0  # the start logit at the slice with min null score
+        null_end_logit = 0  # the end logit at the slice with min null score
+        for (feature_index, feature) in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
+            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
+            # if we could have irrelevant answers, get the min score of irrelevant
+            if version_2_with_negative:
+                feature_null_score = result.start_logits[0] + result.end_logits[0]
+                if feature_null_score < score_null:
+                    score_null = feature_null_score
+                    min_null_feature_index = feature_index
+                    null_start_logit = result.start_logits[0]
+                    null_end_logit = result.end_logits[0]
+            for start_index in start_indexes:
+                for end_index in end_indexes:
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= len(feature.tokens):
+                        continue
+                    if end_index >= len(feature.tokens):
+                        continue
+                    if start_index not in feature.token_to_orig_map:
+                        continue
+                    if end_index not in feature.token_to_orig_map:
+                        continue
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_logit=result.start_logits[start_index],
+                            end_logit=result.end_logits[end_index]))
+        if version_2_with_negative:
+            prelim_predictions.append(
+                _PrelimPrediction(
+                    feature_index=min_null_feature_index,
+                    start_index=0,
+                    end_index=0,
+                    start_logit=null_start_logit,
+                    end_logit=null_end_logit))
+        prelim_predictions = sorted(
+            prelim_predictions,
+            key=lambda x: (x.start_logit + x.end_logit),
+            reverse=True)
+
+        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+            "NbestPrediction", ["text", "start_logit", "end_logit"])
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+            if pred.start_index > 0:  # this is a non-null prediction
+                tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+                orig_doc_start = feature.token_to_orig_map[pred.start_index]
+                orig_doc_end = feature.token_to_orig_map[pred.end_index]
+                orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+                tok_text = " ".join(tok_tokens)
+
+                # De-tokenize WordPieces that have been split off.
+                tok_text = tok_text.replace(" ##", "")
+                tok_text = tok_text.replace("##", "")
+
+                # Clean whitespace
+                tok_text = tok_text.strip()
+                tok_text = " ".join(tok_text.split())
+                orig_text = " ".join(orig_tokens)
+
+                final_text = get_final_text(tok_text, orig_text, do_lower_case, verbose_logging)
+                if final_text in seen_predictions:
+                    continue
+
+                seen_predictions[final_text] = True
+            else:
+                final_text = ""
+                seen_predictions[final_text] = True
+
+            nbest.append(
+                _NbestPrediction(
+                    text=final_text,
+                    start_logit=pred.start_logit,
+                    end_logit=pred.end_logit))
+        # if we didn't include the empty option in the n-best, include it
+        if version_2_with_negative:
+            if "" not in seen_predictions:
+                nbest.append(
+                    _NbestPrediction(
+                        text="",
+                        start_logit=null_start_logit,
+                        end_logit=null_end_logit))
+                
+            # In very rare edge cases we could have only a single null prediction.
+            # So we just create a nonce prediction in this case to avoid failure.
+            if len(nbest)==1:
+                nbest.insert(0,
+                    _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(
+                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
+
+        assert len(nbest) >= 1
+
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_logit + entry.end_logit)
+            if not best_non_null_entry:
+                if entry.text:
+                    best_non_null_entry = entry
+
+        probs = _compute_softmax(total_scores)
+
+        nbest_json = []
+        for (i, entry) in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_logit"] = entry.start_logit
+            output["end_logit"] = entry.end_logit
+            nbest_json.append(output)
+
+        assert len(nbest_json) >= 1
+
+        if not version_2_with_negative:
+            all_predictions[example.qas_id] = nbest_json[0]["text"]
+        else:
+            # predict "" iff the null score - the score of best non-null > threshold
+            score_diff = score_null - best_non_null_entry.start_logit - (
+                best_non_null_entry.end_logit)
+            scores_diff_json[example.qas_id] = score_diff
+            if score_diff > null_score_diff_threshold:
+                all_predictions[example.qas_id] = ""
+            else:
+                all_predictions[example.qas_id] = best_non_null_entry.text
+        all_nbest_json[example.qas_id] = nbest_json
+
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+    if version_2_with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    return all_predictions  # OrderedDict: qas_id -> predicted answer text
+
+
+# For XLNet (and XLM which uses the same head); consumed by write_predictions_extended()
+RawResultExtended = collections.namedtuple("RawResultExtended",
+    ["unique_id", "start_top_log_probs", "start_top_index",
+     "end_top_log_probs", "end_top_index", "cls_logits"])
+
+
+def write_predictions_extended(all_examples, all_features, all_results, n_best_size,
+                                max_answer_length, output_prediction_file,
+                                output_nbest_file,
+                                output_null_log_odds_file, orig_data_file,
+                                start_n_top, end_n_top, version_2_with_negative,
+                                tokenizer, verbose_logging):
+    """ XLNet write prediction logic (more complex than Bert's).
+        Write final predictions to the json file and log-odds of null if needed.
+
+        Requires utils_squad_evaluate.py
+    """
+    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "PrelimPrediction",
+        ["feature_index", "start_index", "end_index",
+        "start_log_prob", "end_log_prob"])
+
+    _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
+        "NbestPrediction", ["text", "start_log_prob", "end_log_prob"])
+
+    logger.info("Writing predictions to: %s", output_prediction_file)
+    # logger.info("Writing nbest to: %s" % (output_nbest_file))
+
+    example_index_to_features = collections.defaultdict(list)
+    for feature in all_features:
+        example_index_to_features[feature.example_index].append(feature)
+
+    unique_id_to_result = {}
+    for result in all_results:
+        unique_id_to_result[result.unique_id] = result
+
+    all_predictions = collections.OrderedDict()
+    all_nbest_json = collections.OrderedDict()
+    scores_diff_json = collections.OrderedDict()
+
+    for (example_index, example) in enumerate(all_examples):
+        features = example_index_to_features[example_index]
+
+        prelim_predictions = []
+        # keep track of the minimum score of null start+end of position 0
+        score_null = 1000000  # large and positive
+
+        for (feature_index, feature) in enumerate(features):
+            result = unique_id_to_result[feature.unique_id]
+
+            cur_null_score = result.cls_logits
+
+            # if we could have irrelevant answers, get the min score of irrelevant
+            score_null = min(score_null, cur_null_score)
+
+            for i in range(start_n_top):
+                for j in range(end_n_top):
+                    start_log_prob = result.start_top_log_probs[i]
+                    start_index = result.start_top_index[i]
+
+                    j_index = i * end_n_top + j
+
+                    end_log_prob = result.end_top_log_probs[j_index]
+                    end_index = result.end_top_index[j_index]
+
+                    # We could hypothetically create invalid predictions, e.g., predict
+                    # that the start of the span is in the question. We throw out all
+                    # invalid predictions.
+                    if start_index >= feature.paragraph_len - 1:
+                        continue
+                    if end_index >= feature.paragraph_len - 1:
+                        continue
+
+                    if not feature.token_is_max_context.get(start_index, False):
+                        continue
+                    if end_index < start_index:
+                        continue
+                    length = end_index - start_index + 1
+                    if length > max_answer_length:
+                        continue
+
+                    prelim_predictions.append(
+                        _PrelimPrediction(
+                            feature_index=feature_index,
+                            start_index=start_index,
+                            end_index=end_index,
+                            start_log_prob=start_log_prob,
+                            end_log_prob=end_log_prob))
+
+        prelim_predictions = sorted(
+            prelim_predictions,
+            key=lambda x: (x.start_log_prob + x.end_log_prob),
+            reverse=True)
+
+        seen_predictions = {}
+        nbest = []
+        for pred in prelim_predictions:
+            if len(nbest) >= n_best_size:
+                break
+            feature = features[pred.feature_index]
+
+            # XLNet un-tokenizer
+            # Let's keep it simple for now and see if we need all this later.
+            # 
+            # tok_start_to_orig_index = feature.tok_start_to_orig_index
+            # tok_end_to_orig_index = feature.tok_end_to_orig_index
+            # start_orig_pos = tok_start_to_orig_index[pred.start_index]
+            # end_orig_pos = tok_end_to_orig_index[pred.end_index]
+            # paragraph_text = example.paragraph_text
+            # final_text = paragraph_text[start_orig_pos: end_orig_pos + 1].strip()
+
+            # Previously used Bert untokenizer
+            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
+            orig_doc_start = feature.token_to_orig_map[pred.start_index]
+            orig_doc_end = feature.token_to_orig_map[pred.end_index]
+            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
+            tok_text = tokenizer.convert_tokens_to_string(tok_tokens)
+
+            # Clean whitespace
+            tok_text = tok_text.strip()
+            tok_text = " ".join(tok_text.split())
+            orig_text = " ".join(orig_tokens)
+
+            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
+                                        verbose_logging)
+
+            if final_text in seen_predictions:
+                continue
+
+            seen_predictions[final_text] = True
+
+            nbest.append(
+                _NbestPrediction(
+                    text=final_text,
+                    start_log_prob=pred.start_log_prob,
+                    end_log_prob=pred.end_log_prob))
+
+        # In very rare edge cases we could have no valid predictions. So we
+        # just create a nonce prediction in this case to avoid failure.
+        if not nbest:
+            nbest.append(
+                _NbestPrediction(text="", start_log_prob=-1e6,
+                end_log_prob=-1e6))
+
+        total_scores = []
+        best_non_null_entry = None
+        for entry in nbest:
+            total_scores.append(entry.start_log_prob + entry.end_log_prob)
+            if not best_non_null_entry:
+                best_non_null_entry = entry
+
+        probs = _compute_softmax(total_scores)
+
+        nbest_json = []
+        for (i, entry) in enumerate(nbest):
+            output = collections.OrderedDict()
+            output["text"] = entry.text
+            output["probability"] = probs[i]
+            output["start_log_prob"] = entry.start_log_prob
+            output["end_log_prob"] = entry.end_log_prob
+            nbest_json.append(output)
+
+        assert len(nbest_json) >= 1
+        assert best_non_null_entry is not None
+
+        score_diff = score_null
+        scores_diff_json[example.qas_id] = score_diff
+        # note(zhiliny): always predict best_non_null_entry
+        # and the evaluation script will search for the best threshold
+        all_predictions[example.qas_id] = best_non_null_entry.text
+
+        all_nbest_json[example.qas_id] = nbest_json
+
+    with open(output_prediction_file, "w") as writer:
+        writer.write(json.dumps(all_predictions, indent=4) + "\n")
+
+    with open(output_nbest_file, "w") as writer:
+        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
+
+    if version_2_with_negative:
+        with open(output_null_log_odds_file, "w") as writer:
+            writer.write(json.dumps(scores_diff_json, indent=4) + "\n")
+
+    with open(orig_data_file, "r", encoding='utf-8') as reader:
+        orig_data = json.load(reader)["data"]
+
+    qid_to_has_ans = make_qid_to_has_ans(orig_data)
+    has_ans_qids = [k for k, v in qid_to_has_ans.items() if v]
+    no_ans_qids = [k for k, v in qid_to_has_ans.items() if not v]
+    exact_raw, f1_raw = get_raw_scores(orig_data, all_predictions)
+    out_eval = {}
+
+    find_all_best_thresh_v2(out_eval, all_predictions, exact_raw, f1_raw, scores_diff_json, qid_to_has_ans)
+
+    return out_eval
+
+
+def get_final_text(pred_text, orig_text, do_lower_case, verbose_logging=False):
+    """Project the tokenized prediction back to the original text."""
+
+    # When we created the data, we kept track of the alignment between original
+    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
+    # now `orig_text` contains the span of our original text corresponding to the
+    # span that we predicted.
+    #
+    # However, `orig_text` may contain extra characters that we don't want in
+    # our prediction.
+    #
+    # For example, let's say:
+    #   pred_text = steve smith
+    #   orig_text = Steve Smith's
+    #
+    # We don't want to return `orig_text` because it contains the extra "'s".
+    #
+    # We don't want to return `pred_text` because it's already been normalized
+    # (the SQuAD eval script also does punctuation stripping/lower casing but
+    # our tokenizer does additional normalization like stripping accent
+    # characters).
+    #
+    # What we really want to return is "Steve Smith".
+    #
+    # Therefore, we have to apply a semi-complicated alignment heuristic between
+    # `pred_text` and `orig_text` to get a character-to-character alignment. This
+    # can fail in certain cases in which case we just return `orig_text`.
+
+    def _strip_spaces(text):
+        ns_chars = []
+        ns_to_s_map = collections.OrderedDict()
+        for (i, c) in enumerate(text):
+            if c == " ":
+                continue
+            ns_to_s_map[len(ns_chars)] = i
+            ns_chars.append(c)
+        ns_text = "".join(ns_chars)
+        return (ns_text, ns_to_s_map)
+
+    # We first tokenize `orig_text`, strip whitespace from the result
+    # and `pred_text`, and check if they are the same length. If they are
+    # NOT the same length, the heuristic has failed. If they are the same
+    # length, we assume the characters are one-to-one aligned.
+    tokenizer = BasicTokenizer(do_lower_case=do_lower_case)
+
+    tok_text = " ".join(tokenizer.tokenize(orig_text))
+
+    start_position = tok_text.find(pred_text)
+    if start_position == -1:
+        if verbose_logging:
+            logger.info(
+                "Unable to find text: '%s' in '%s'" % (pred_text, orig_text))
+        return orig_text
+    end_position = start_position + len(pred_text) - 1
+
+    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
+    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
+
+    if len(orig_ns_text) != len(tok_ns_text):
+        if verbose_logging:
+            logger.info("Length not equal after stripping spaces: '%s' vs '%s'",
+                        orig_ns_text, tok_ns_text)
+        return orig_text
+
+    # We then project the characters in `pred_text` back to `orig_text` using
+    # the character-to-character alignment.
+    tok_s_to_ns_map = {}
+    for (i, tok_index) in tok_ns_to_s_map.items():
+        tok_s_to_ns_map[tok_index] = i
+
+    orig_start_position = None
+    if start_position in tok_s_to_ns_map:
+        ns_start_position = tok_s_to_ns_map[start_position]
+        if ns_start_position in orig_ns_to_s_map:
+            orig_start_position = orig_ns_to_s_map[ns_start_position]
+
+    if orig_start_position is None:
+        if verbose_logging:
+            logger.info("Couldn't map start position")
+        return orig_text
+
+    orig_end_position = None
+    if end_position in tok_s_to_ns_map:
+        ns_end_position = tok_s_to_ns_map[end_position]
+        if ns_end_position in orig_ns_to_s_map:
+            orig_end_position = orig_ns_to_s_map[ns_end_position]
+
+    if orig_end_position is None:
+        if verbose_logging:
+            logger.info("Couldn't map end position")
+        return orig_text
+
+    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
+    return output_text
+
+
+def _get_best_indexes(logits, n_best_size):
+    """Get the n-best logits from a list."""
+    index_and_score = sorted(enumerate(logits), key=lambda x: x[1], reverse=True)
+
+    best_indexes = []
+    for i in range(len(index_and_score)):
+        if i >= n_best_size:
+            break
+        best_indexes.append(index_and_score[i][0])
+    return best_indexes
+
+
+def _compute_softmax(scores):
+    """Compute softmax probability over raw logits."""
+    if not scores:
+        return []
+
+    max_score = None
+    for score in scores:
+        if max_score is None or score > max_score:
+            max_score = score
+
+    exp_scores = []
+    total_sum = 0.0
+    for score in scores:
+        x = math.exp(score - max_score)
+        exp_scores.append(x)
+        total_sum += x
+
+    probs = []
+    for score in exp_scores:
+        probs.append(score / total_sum)
+    return probs
diff --git a/templates/adding_a_new_model/README.md b/templates/adding_a_new_model/README.md
new file mode 100644
index 0000000000..1569b51e89
--- /dev/null
+++ b/templates/adding_a_new_model/README.md
@@ -0,0 +1,62 @@
+# How to add a new model in 🤗Transformers
+
+This folder describes the process to add a new model in 🤗Transformers and provides templates for the required files.
+
+The library is designed to incorporate a variety of models and code bases. As such, the process for adding a new model mostly consists of copy-pasting the relevant original code into the various sections of the templates included in the present repository.
+
+One important point though is that the library has the following goals impacting the way models are incorporated:
+
+- one specific feature of the API is the capability to run the model and tokenizer inline. The tokenization code thus often has to be slightly adapted to allow for running in the python interpreter.
+- the package is also designed to be as self-consistent as possible, with a small and reliable set of package dependencies. Consequently, additional dependencies are usually not allowed when adding a model but can be allowed for the inclusion of a new tokenizer (recent examples of dependencies added for tokenizer specificities include `sentencepiece` and `sacremoses`). Please make sure to check the existing dependencies when possible before adding a new one.
+
+For a quick overview of the library organization, please check the [QuickStart section of the documentation](https://huggingface.co/transformers/quickstart.html).
+
+# Typical workflow for including a model
+
+Here is an overview of the general workflow:
+
+- [ ] add model/configuration/tokenization classes
+- [ ] add conversion scripts
+- [ ] add tests
+- [ ] finalize
+
+Let's detail what should be done at each step.
+
+## Adding model/configuration/tokenization classes
+
+Here is the workflow for adding model/configuration/tokenization classes:
+
+- [ ] copy the python files from the present folder to the main folder and rename them, replacing `xxx` with your model name,
+- [ ] edit the files to replace `XXX` (with various casing) with your model name
+- [ ] copy-paste or create a simple configuration class for your model in the `configuration_...` file
+- [ ] copy-paste or create the code for your model in the `modeling_...` files (PyTorch and TF 2.0)
+- [ ] copy-paste or create a tokenizer class for your model in the `tokenization_...` file
+
+# Adding conversion scripts
+
+Here is the workflow for the conversion scripts:
+
+- [ ] copy the conversion script (`convert_...`) from the present folder to the main folder.
+- [ ] edit this script to convert your original checkpoint weights to the current pytorch ones.
+
+# Adding tests:
+
+Here is the workflow for adding tests:
+
+- [ ] copy the python files from the `tests` sub-folder of the present folder to the `tests` subfolder of the main folder and rename them, replacing `xxx` with your model name,
+- [ ] edit the test files to replace `XXX` (with various casing) with your model name
+- [ ] edit the test code as needed
+
+# Final steps
+
+You can then finish the addition step by adding imports for your classes in the common files:
+
+- [ ] add import for all the relevant classes in `__init__.py`
+- [ ] add your configuration in `configuration_auto.py`
+- [ ] add your PyTorch and TF 2.0 model respectively in `modeling_auto.py` and `modeling_tf_auto.py`
+- [ ] add your tokenizer in `tokenization_auto.py`
+- [ ] add your models and tokenizer to `pipeline.py`
+- [ ] add a link to your conversion script in the main conversion utility (currently in `__main__` but will be moved to the `commands` subfolder in the near future)
+- [ ] edit the PyTorch to TF 2.0 conversion script to add your model in the `convert_pytorch_checkpoint_to_tf2.py` file
+- [ ] add a mention of your model in the doc: `README.md` and the documentation itself at `docs/source/pretrained_models.rst`.
+- [ ] upload the pretrained weights, configurations and vocabulary files.
diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py
new file mode 100644
index 0000000000..b1614e71af
--- /dev/null
+++ b/templates/adding_a_new_model/configuration_xxx.py
@@ -0,0 +1,130 @@
+# coding=utf-8
+# Copyright 2010, XXX authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" XXX model configuration """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+import six
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+XXX_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-config.json",
+    'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-config.json",
+}
+
+
+class XxxConfig(PretrainedConfig):
+    r"""
+        :class:`~transformers.XxxConfig` is the configuration class to store the configuration of a
+        `XxxModel`.
+
+
+        Arguments:
+            vocab_size_or_config_json_file: Vocabulary size (int), stored as `vocab_size`, or
+                path (str) to a JSON config file whose keys are copied onto this instance.
+                With a path, `vocab_size` is -1 unless the file provides it.
+            n_positions: Stored as-is; also exposed as `max_position_embeddings`.
+            n_ctx: Stored as-is (presumably the context size; confirm for your model).
+            n_embd: Stored as-is; also exposed as `hidden_size`.
+            n_layer: Stored as-is; also exposed as `num_hidden_layers`.
+            n_head: Stored as-is; also exposed as `num_attention_heads`.
+            resid_pdrop, embd_pdrop, attn_pdrop: Stored as-is (presumably dropout
+                probabilities; confirm for your model).
+            layer_norm_epsilon: Stored as-is (presumably the epsilon used by LayerNorm).
+            initializer_range: Stored as-is (presumably the stddev used to initialize
+                the weight matrices).
+            num_labels, summary_type, summary_use_proj, summary_activation,
+            summary_proj_to_labels, summary_first_dropout: Stored as-is (presumably
+                options of a sequence-summary head; confirm for your model).
+            **kwargs: Forwarded to `PretrainedConfig.__init__`.
+
+        Raises:
+            ValueError: If the first argument is neither a vocabulary size (int) nor a
+                config file path (str).
+    """
+    pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=50257,
+                 n_positions=1024,
+                 n_ctx=1024,
+                 n_embd=768,
+                 n_layer=12,
+                 n_head=12,
+                 resid_pdrop=0.1,
+                 embd_pdrop=0.1,
+                 attn_pdrop=0.1,
+                 layer_norm_epsilon=1e-5,
+                 initializer_range=0.02,
+
+                 num_labels=1,
+                 summary_type='cls_index',
+                 summary_use_proj=True,
+                 summary_activation=None,
+                 summary_proj_to_labels=True,
+                 summary_first_dropout=0.1,
+                 **kwargs):
+        super(XxxConfig, self).__init__(**kwargs)
+        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.n_ctx = n_ctx
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+
+        self.num_labels = num_labels
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels
+        if isinstance(vocab_size_or_config_json_file, six.string_types):
+            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif not isinstance(vocab_size_or_config_json_file, int):
+            raise ValueError(
+                "First argument must be either a vocabulary size (int) "
+                "or the path to a pretrained model config file (str)"
+            )
+
+    @property
+    def max_position_embeddings(self):
+        return self.n_positions
+
+    @property
+    def hidden_size(self):
+        return self.n_embd
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
diff --git a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
new file mode 100755
index 0000000000..d50d129cba
--- /dev/null
+++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
@@ -0,0 +1,65 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert XXX checkpoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import torch
+
+from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx
+
+import logging
+logging.basicConfig(level=logging.INFO)
+
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, xxx_config_file, pytorch_dump_path):
+    # Initialise PyTorch model
+    config = XxxConfig.from_json_file(xxx_config_file)
+    print("Building PyTorch model from configuration: {}".format(str(config)))
+    model = XxxForPreTraining(config)
+
+    # Load weights from tf checkpoint
+    load_tf_weights_in_xxx(model, config, tf_checkpoint_path)
+
+    # Save pytorch-model
+    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    torch.save(model.state_dict(), pytorch_dump_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters
+    parser.add_argument("--tf_checkpoint_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the TensorFlow checkpoint path.")
+    parser.add_argument("--xxx_config_file",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "The config json file corresponding to the pre-trained XXX model. \n"
+                            "This specifies the model architecture.")
+    parser.add_argument("--pytorch_dump_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the output PyTorch model.")
+    args = parser.parse_args()
+    convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
+                                     args.xxx_config_file,
+                                     args.pytorch_dump_path)
diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py
new file mode 100644
index 0000000000..c661975768
--- /dev/null
+++ b/templates/adding_a_new_model/modeling_tf_xxx.py
@@ -0,0 +1,500 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 XXX model. """
+
+####################################################
+# In this template, replace all the XXX (various casings) with your model name
+####################################################
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import math
+import os
+import sys
+from io import open
+
+import numpy as np
+import tensorflow as tf
+
+from .configuration_xxx import XxxConfig
+from .modeling_tf_utils import TFPreTrainedModel, get_initializer
+from .file_utils import add_start_docstrings
+
+logger = logging.getLogger(__name__)
+
+####################################################
+# This dict contains shortcut names and associated URLs
+# for the pretrained weights provided with the models
+####################################################
+TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-tf_model.h5",
+    'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-tf_model.h5",
+}
+
+####################################################
+# TF 2.0 Models are constructed using Keras imperative API by sub-classing
+# - tf.keras.layers.Layer for the layers and
+# - TFPreTrainedModel for the models (itself a sub-class of tf.keras.Model)
+####################################################
+
+####################################################
+# Here is an example of typical layer in a TF 2.0 model of the library
+# The classes are usually identical to the PyTorch ones and prefixed with 'TF'.
+#
+# Note that class __init__ parameters include **kwargs (sent to 'super').
+# This lets us control class scope and variable names:
+# More precisely, we set the names of the class attributes (lower level layers)
+# to the equivalent attribute names in the PyTorch model so we can have equivalent
+# class and scope structure between PyTorch and TF 2.0 models and easily load one in the other.
+#
+# See the conversion methods in modeling_tf_pytorch_utils.py for more details
+####################################################
+class TFXxxLayer(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFXxxLayer, self).__init__(**kwargs)
+        self.attention = TFXxxAttention(config, name='attention')
+        self.intermediate = TFXxxIntermediate(config, name='intermediate')
+        self.transformer_output = TFXxxOutput(config, name='output')
+
+    def call(self, inputs, training=False):
+        hidden_states, attention_mask, head_mask = inputs
+
+        attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training)
+        attention_output = attention_outputs[0]
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.transformer_output([intermediate_output, attention_output], training=training)
+        outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+####################################################
+# The full model without a specific pretrained or finetuning head is
+# provided as a tf.keras.layers.Layer usually called "TFXxxMainLayer"
+####################################################
+class TFXxxMainLayer(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super(TFXxxMainLayer, self).__init__(**kwargs)
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
+
+    def _prune_heads(self, heads_to_prune):
+        raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
+
+    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
+        # We allow three types of multi-inputs:
+        # - traditional keyword arguments in the call method
+        # - all the arguments provided as a dict in the first positional argument of call
+        # - all the arguments provided as a list/tuple (ordered) in the first positional argument of call
+        # The last two options are useful to use the tf.keras fit() method.
+
+        if isinstance(inputs, (tuple, list)):
+            input_ids = inputs[0]
+            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
+            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
+            position_ids = inputs[3] if len(inputs) > 3 else position_ids
+            head_mask = inputs[4] if len(inputs) > 4 else head_mask
+            assert len(inputs) <= 5, "Too many inputs."
+        elif isinstance(inputs, dict):
+            input_ids = inputs.get('input_ids')
+            attention_mask = inputs.get('attention_mask', attention_mask)
+            token_type_ids = inputs.get('token_type_ids', token_type_ids)
+            position_ids = inputs.get('position_ids', position_ids)
+            head_mask = inputs.get('head_mask', head_mask)
+            assert len(inputs) <= 5, "Too many inputs."
+        else:
+            input_ids = inputs
+
+        if attention_mask is None:
+            attention_mask = tf.fill(tf.shape(input_ids), 1)
+        if token_type_ids is None:
+            token_type_ids = tf.fill(tf.shape(input_ids), 0)
+
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+
+        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if not head_mask is None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.num_hidden_layers
+            # head_mask = tf.constant([0] * self.num_hidden_layers)
+
+        ##################################
+        # Replace this with your model code
+        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
+        encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
+        sequence_output = encoder_outputs[0]
+        outputs = (sequence_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+
+        return outputs  # sequence_output, (hidden_states), (attentions)
+
+
+####################################################
+# TFXxxPreTrainedModel is a sub-class of tf.keras.Model
+# which takes care of loading and saving pretrained weights
+# and various common utilities.
+# Here you just need to specify a few (self-explanatory)
+# pointers for your model.
+####################################################
+class TFXxxPreTrainedModel(TFPreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = XxxConfig
+    pretrained_model_archive_map = TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "transformer"
+
+
+XXX_START_DOCSTRING = r"""    The XXX model was proposed in
+    `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_
+    by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
+    pre-trained using a combination of masked language modeling objective and next sentence prediction
+    on a large corpus comprising the Toronto Book Corpus and Wikipedia.
+
+    This model is a `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+    refer to the TF 2.0 documentation for all matters related to general usage and behavior.
+
+    .. _`XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
+        https://arxiv.org/abs/1810.04805
+
+    .. _`tf.keras.Model`:
+        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+
+    Note on the model inputs:
+        TF 2.0 models accept two formats as inputs:
+
+            - having all inputs as keyword arguments (like PyTorch models), or
+            - having all inputs as a list, tuple or dict in the first positional arguments.
+
+        This second option is useful when using the `tf.keras.Model.fit()` method, which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+
+        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
+
+        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
+        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+
+    Parameters:
+        config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+XXX_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, XXX input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+                
+                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
+
+            (b) For single sequences:
+
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+                
+                ``token_type_ids:   0   0   0   0  0     0   0``
+
+            Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
+            Indices can be obtained using :class:`transformers.XxxTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Segment token indices to indicate first and second portions of the inputs.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.",
+                      XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class TFXxxModel(TFXxxPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)``
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            objective during Xxx pretraining. This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import XxxTokenizer, TFXxxModel
+
+        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
+        model = TFXxxModel.from_pretrained('xxx-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFXxxModel, self).__init__(config, *inputs, **kwargs)
+        self.transformer = TFXxxMainLayer(config, name='transformer')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.transformer(inputs, **kwargs)
+        return outputs
+
+
+@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """,
+    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class TFXxxForMaskedLM(TFXxxPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import XxxTokenizer, TFXxxForMaskedLM
+
+        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
+        model = TFXxxForMaskedLM.from_pretrained('xxx-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        prediction_scores = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFXxxForMaskedLM, self).__init__(config, *inputs, **kwargs)
+
+        self.transformer = TFXxxMainLayer(config, name='transformer')
+        self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name='mlm')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.transformer(inputs, **kwargs)
+
+        sequence_output = outputs[0]
+        prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
+
+        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
+
+        return outputs  # prediction_scores, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class TFXxxForSequenceClassification(TFXxxPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import XxxTokenizer, TFXxxForSequenceClassification
+
+        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
+        model = TFXxxForSequenceClassification.from_pretrained('xxx-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        logits = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFXxxForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.transformer = TFXxxMainLayer(config, name='transformer')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.transformer(inputs, **kwargs)
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
+        logits = self.classifier(pooled_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        return outputs  # logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class TFXxxForTokenClassification(TFXxxPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import XxxTokenizer, TFXxxForTokenClassification
+
+        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
+        model = TFXxxForTokenClassification.from_pretrained('xxx-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        scores = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFXxxForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.transformer = TFXxxMainLayer(config, name='transformer')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.transformer(inputs, **kwargs)
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        return outputs  # scores, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class TFXxxForQuestionAnswering(TFXxxPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **start_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import XxxTokenizer, TFXxxForQuestionAnswering
+
+        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
+        model = TFXxxForQuestionAnswering.from_pretrained('xxx-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        start_scores, end_scores = outputs[:2]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFXxxForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.transformer = TFXxxMainLayer(config, name='transformer')
+        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='qa_outputs')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.transformer(inputs, **kwargs)
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)
+        start_logits = tf.squeeze(start_logits, axis=-1)
+        end_logits = tf.squeeze(end_logits, axis=-1)
+
+        outputs = (start_logits, end_logits,) + outputs[2:]
+
+        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py
new file mode 100644
index 0000000000..7e2ba9dfb5
--- /dev/null
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -0,0 +1,644 @@
+# coding=utf-8
+# Copyright 2018 XXX Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch XXX model. """
+
+####################################################
+# In this template, replace all the XXX (various casings) with your model name
+####################################################
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import math
+import os
+import sys
+from io import open
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from .modeling_utils import PreTrainedModel, prune_linear_layer
+from .configuration_xxx import XxxConfig
+from .file_utils import add_start_docstrings
+
+logger = logging.getLogger(__name__)
+
+####################################################
+# This dict contains shortcut names and associated URLs
+# for the pretrained weights provided with the models
+####################################################
+XXX_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-pytorch_model.bin",
+    'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-pytorch_model.bin",
+}
+
+####################################################
+# This is a conversion method from TF 1.0 to PyTorch
+# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
+####################################################
+def load_tf_weights_in_xxx(model, config, tf_checkpoint_path):
+    """ Load tf checkpoints in a pytorch model.
+    """
+    try:
+        import re
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        name = name.split('/')
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
+        # which are not required for using pretrained model
+        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
+            logger.info("Skipping {}".format("/".join(name)))
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                l = re.split(r'_(\d+)', m_name)
+            else:
+                l = [m_name]
+            if l[0] == 'kernel' or l[0] == 'gamma':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'output_bias' or l[0] == 'beta':
+                pointer = getattr(pointer, 'bias')
+            elif l[0] == 'output_weights':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'squad':
+                pointer = getattr(pointer, 'classifier')
+            else:
+                try:
+                    pointer = getattr(pointer, l[0])
+                except AttributeError:
+                    logger.info("Skipping {}".format("/".join(name)))
+                    continue
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        if m_name[-11:] == '_embeddings':
+            pointer = getattr(pointer, 'weight')
+        elif m_name == 'kernel':
+            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+####################################################
+# PyTorch Models are constructed by sub-classing
+# - torch.nn.Module for the layers and
+# - PreTrainedModel for the models (itself a sub-class of torch.nn.Module)
+####################################################
+
+####################################################
+# Here is an example of typical layer in a PyTorch model of the library
+# The classes are usually identical to the TF 2.0 ones without the 'TF' prefix.
+#
+# See the conversion methods in modeling_tf_pytorch_utils.py for more details
+####################################################
+class XxxLayer(nn.Module):
+    def __init__(self, config):
+        super(XxxLayer, self).__init__()
+        self.attention = XxxAttention(config)
+        self.intermediate = XxxIntermediate(config)
+        self.output = XxxOutput(config)
+
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
+        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
+        attention_output = attention_outputs[0]
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+
+####################################################
+# PreTrainedModel is a sub-class of torch.nn.Module
+# which takes care of loading and saving pretrained weights
+# and various common utilities.
+#
+# Here you just need to specify a few (self-explanatory)
+# pointers for your model and the weights initialization
+# method if it's not fully covered by PreTrainedModel's default method
+####################################################
+class XxxPreTrainedModel(PreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = XxxConfig
+    pretrained_model_archive_map = XXX_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_xxx
+    base_model_prefix = "transformer"
+
+    def _init_weights(self, module):
+        """ Initialize the weights """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, XxxLayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+
+XXX_START_DOCSTRING = r"""    The XXX model was proposed in
+    `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_
+    by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
+    pre-trained using a combination of masked language modeling objective and next sentence prediction
+    on a large corpus comprising the Toronto Book Corpus and Wikipedia.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matters related to general usage and behavior.
+
+    .. _`XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
+        https://arxiv.org/abs/1810.04805
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+XXX_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, XXX input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+                
+                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
+
+            (b) For single sequences:
+
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+                
+                ``token_type_ids:   0   0   0   0  0     0   0``
+
+            Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
+            Indices can be obtained using :class:`transformers.XxxTokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Segment token indices to indicate first and second portions of the inputs.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare Xxx Model transformer outputting raw hidden-states without any specific head on top.",
+                      XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class XxxModel(XxxPreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            objective during Xxx pretraining. This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
+        model = XxxModel.from_pretrained('xxx-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    def __init__(self, config):
+        super(XxxModel, self).__init__(config)
+
+        self.embeddings = XxxEmbeddings(config)
+        self.encoder = XxxEncoder(config)
+        self.pooler = XxxPooler(config)
+
+        self.init_weights()
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        old_embeddings = self.embeddings.word_embeddings
+        new_embeddings = self._get_resized_embeddings(old_embeddings, new_num_tokens)
+        self.embeddings.word_embeddings = new_embeddings
+        return self.embeddings.word_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+        """ Encode ``input_ids``; the arguments are described in the inputs docstring of this class.
+            Returns a tuple: sequence_output, pooled_output, (hidden_states), (attentions).
+        """
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        # Turn the 2D padding mask into a 4D tensor of size [batch_size, 1, 1, to_seq_length]
+        # so that it broadcasts to [batch_size, num_heads, from_seq_length, to_seq_length].
+        # No triangular (causal) masking is needed here, only the broadcast dimensions.
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+        # attention_mask is 1 for positions to attend and 0 for masked positions; convert it to
+        # 0.0 / -10000.0 so that adding it to the raw scores before the softmax effectively
+        # removes the masked positions.
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        # Prepare head mask if needed (1.0 in head_mask indicates we keep the head).
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        ##################################
+        # Replace this with your model code
+        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
+        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
+        sequence_output = encoder_outputs[0]
+        # NOTE(review): XxxPooler is defined outside this block; assumed to take the full sequence output - confirm.
+        pooled_output = self.pooler(sequence_output)
+        outputs = (sequence_output, pooled_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+
+        return outputs  # sequence_output, pooled_output, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """,
+    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class XxxForMaskedLM(XxxPreTrainedModel):
+    r"""
+        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-1, 0, ..., config.vocab_size - 1]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size - 1]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Masked language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
+        model = XxxForMaskedLM.from_pretrained('xxx-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, masked_lm_labels=input_ids)
+        loss, prediction_scores = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(XxxForMaskedLM, self).__init__(config)
+        # Base encoder followed by the masked-LM prediction head.
+        self.transformer = XxxModel(config)
+        self.cls = XxxOnlyMLMHead(config)
+        # Initialize the weights first, then tie the decoder to the input embeddings.
+        self.init_weights()
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Share (or clone, for TorchScript export, which cannot handle parameter sharing)
+            the input word embeddings with the output decoder.
+        """
+        input_embeddings = self.transformer.embeddings.word_embeddings
+        self._tie_or_clone_weights(self.cls.predictions.decoder, input_embeddings)
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                masked_lm_labels=None):
+        """ Returns (masked_lm_loss), prediction_scores, (hidden_states), (attentions). """
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               head_mask=head_mask)
+        # Index 0 holds the per-token hidden states fed to the LM head.
+        prediction_scores = self.cls(transformer_outputs[0])
+        # Skip index 1 and append whatever extra outputs follow (hidden states / attentions).
+        result = (prediction_scores,) + transformer_outputs[2:]
+
+        if masked_lm_labels is None:
+            return result
+        # Positions labelled -1 do not contribute to the loss.
+        flat_scores = prediction_scores.view(-1, self.config.vocab_size)
+        masked_lm_loss = CrossEntropyLoss(ignore_index=-1)(flat_scores, masked_lm_labels.view(-1))
+        return (masked_lm_loss,) + result
+
+
+@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class XxxForSequenceClassification(XxxPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
+        model = XxxForSequenceClassification.from_pretrained('xxx-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(XxxForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+        # Encoder, then dropout + linear classifier applied to the pooled output.
+        self.transformer = XxxModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, self.config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+        """ Returns (loss), logits, (hidden_states), (attentions). """
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               head_mask=head_mask)
+        # Index 1 of the base model outputs is used as the pooled sequence representation.
+        pooled_output = self.dropout(transformer_outputs[1])
+        logits = self.classifier(pooled_output)
+
+        # Keep any extra outputs (hidden states / attentions) after the logits.
+        result = (logits,) + transformer_outputs[2:]
+
+        # Without labels there is no loss to compute.
+        if labels is None:
+            return result
+
+        if self.num_labels == 1:
+            # A single label means regression: mean-square error on the flattened values.
+            loss = MSELoss()(logits.view(-1), labels.view(-1))
+        else:
+            # Otherwise classification with cross-entropy over num_labels classes.
+            loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))
+
+        return (loss,) + result
+
+
+@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class XxxForTokenClassification(XxxPreTrainedModel):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the token classification loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
+        model = XxxForTokenClassification.from_pretrained('xxx-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, scores = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(XxxForTokenClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+        # Encoder, then dropout + linear classifier applied to every token position.
+        self.transformer = XxxModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None,
+                position_ids=None, head_mask=None, labels=None):
+        """ Returns (loss), scores, (hidden_states), (attentions). """
+        transformer_outputs = self.transformer(input_ids,
+                                               attention_mask=attention_mask,
+                                               token_type_ids=token_type_ids,
+                                               position_ids=position_ids,
+                                               head_mask=head_mask)
+        # Index 0 holds the per-token hidden states.
+        sequence_output = self.dropout(transformer_outputs[0])
+        logits = self.classifier(sequence_output)
+
+        # Keep any extra outputs (hidden states / attentions) after the logits.
+        result = (logits,) + transformer_outputs[2:]
+        if labels is None:
+            return result
+
+        flat_logits = logits.view(-1, self.num_labels)
+        flat_labels = labels.view(-1)
+        if attention_mask is not None:
+            # Only positions whose attention mask equals 1 take part in the loss.
+            keep = attention_mask.view(-1) == 1
+            flat_logits = flat_logits[keep]
+            flat_labels = flat_labels[keep]
+
+        loss = CrossEntropyLoss()(flat_logits, flat_labels)
+
+        return (loss,) + result
+
+
+@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class XxxForQuestionAnswering(XxxPreTrainedModel):
+    r"""
+        **start_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+        **end_positions**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`).
+            Position outside of the sequence are not taken into account for computing the loss.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``start_positions`` and ``end_positions`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total span extraction loss is the average of a Cross-Entropy for the start and end positions.
+        **start_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
+        model = XxxForQuestionAnswering.from_pretrained('xxx-large-uncased-whole-word-masking-finetuned-squad')
+        question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+        input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
+        input_ids = tokenizer.encode(input_text)
+        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
+        start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
+        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
+        print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
+        # a nice puppet
+
+
+    """
+    def __init__(self, config):
+        super(XxxForQuestionAnswering, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.transformer = XxxModel(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.init_weights()
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                start_positions=None, end_positions=None):
+        """ Returns (loss), start_logits, end_logits, (hidden_states), (attentions). """
+        outputs = self.transformer(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        outputs = (start_logits, end_logits,) + outputs[2:]
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)  # out-of-place: do not modify the caller's tensor
+            end_positions = end_positions.clamp(0, ignored_index)  # out-of-place: do not modify the caller's tensor
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+            outputs = (total_loss,) + outputs
+
+        return outputs  # (loss), start_logits, end_logits, (hidden_states), (attentions)
diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
new file mode 100644
index 0000000000..90837ca1ea
--- /dev/null
+++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
@@ -0,0 +1,256 @@
+# coding=utf-8
+# Copyright 2018 XXX Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import shutil
+import pytest
+import sys
+
+from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester
+
+from transformers import XxxConfig, is_tf_available
+
+if is_tf_available():
+    import tensorflow as tf
+    from transformers.modeling_tf_xxx import (TFXxxModel, TFXxxForMaskedLM,
+                                               TFXxxForSequenceClassification,
+                                               TFXxxForTokenClassification,
+                                               TFXxxForQuestionAnswering,
+                                               TF_XXX_PRETRAINED_MODEL_ARCHIVE_MAP)
+else:
+    pytestmark = pytest.mark.skip("Require TensorFlow")
+
+
+class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
+    """ Output-shape checks for the TF Xxx model classes listed in ``all_model_classes``. """
+    all_model_classes = (TFXxxModel, TFXxxForMaskedLM, TFXxxForQuestionAnswering,
+                         TFXxxForSequenceClassification,
+                         TFXxxForTokenClassification) if is_tf_available() else ()
+
+    class TFXxxModelTester(object):
+        """ Builds a small XxxConfig plus random inputs and asserts the output shapes of each model class. """
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_input_mask=True,
+                     use_token_type_ids=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     intermediate_size=37,
+                     hidden_act="gelu",
+                     hidden_dropout_prob=0.1,
+                     attention_probs_dropout_prob=0.1,
+                     max_position_embeddings=512,
+                     type_vocab_size=16,
+                     type_sequence_label_size=2,
+                     initializer_range=0.02,
+                     num_labels=3,
+                     num_choices=4,
+                     scope=None,
+                    ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_input_mask = use_input_mask
+            self.use_token_type_ids = use_token_type_ids
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.intermediate_size = intermediate_size
+            self.hidden_act = hidden_act
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.type_sequence_label_size = type_sequence_label_size
+            self.initializer_range = initializer_range
+            self.num_labels = num_labels
+            self.num_choices = num_choices
+            self.scope = scope
+
+        def prepare_config_and_inputs(self):
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            # The optional inputs below stay None when the matching use_* flag is off.
+            input_mask = None
+            if self.use_input_mask:
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+            token_type_ids = None
+            if self.use_token_type_ids:
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+            sequence_labels = None
+            token_labels = None
+            choice_labels = None
+            if self.use_labels:
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+            config = XxxConfig(
+                vocab_size_or_config_json_file=self.vocab_size,
+                hidden_size=self.hidden_size,
+                num_hidden_layers=self.num_hidden_layers,
+                num_attention_heads=self.num_attention_heads,
+                intermediate_size=self.intermediate_size,
+                hidden_act=self.hidden_act,
+                hidden_dropout_prob=self.hidden_dropout_prob,
+                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+                max_position_embeddings=self.max_position_embeddings,
+                type_vocab_size=self.type_vocab_size,
+                initializer_range=self.initializer_range)
+
+            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+        def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = TFXxxModel(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            sequence_output, pooled_output = model(inputs)
+            # Same model called with a list of inputs instead of a dict.
+            inputs = [input_ids, input_mask]
+            sequence_output, pooled_output = model(inputs)
+            # And with the bare input_ids tensor; only this last call's outputs are shape-checked below.
+            sequence_output, pooled_output = model(input_ids)
+
+            result = {
+                "sequence_output": sequence_output.numpy(),
+                "pooled_output": pooled_output.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["sequence_output"].shape),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
+
+
+        def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = TFXxxForMaskedLM(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            prediction_scores, = model(inputs)
+            result = {
+                "prediction_scores": prediction_scores.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].shape),
+                [self.batch_size, self.seq_length, self.vocab_size])
+
+
+        def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            config.num_labels = self.num_labels
+            model = TFXxxForSequenceClassification(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            logits, = model(inputs)
+            result = {
+                "logits": logits.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].shape),
+                [self.batch_size, self.num_labels])
+
+
+        def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            config.num_labels = self.num_labels
+            model = TFXxxForTokenClassification(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            logits, = model(inputs)
+            result = {
+                "logits": logits.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].shape),
+                [self.batch_size, self.seq_length, self.num_labels])
+
+
+        def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = TFXxxForQuestionAnswering(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            start_logits, end_logits = model(inputs)
+            result = {
+                "start_logits": start_logits.numpy(),
+                "end_logits": end_logits.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["start_logits"].shape),
+                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(
+                list(result["end_logits"].shape),
+                [self.batch_size, self.seq_length])
+
+
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, token_type_ids, input_mask,
+             sequence_labels, token_labels, choice_labels) = config_and_inputs
+            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            return config, inputs_dict
+
+    def setUp(self):
+        self.model_tester = TFXxxModelTest.TFXxxModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=XxxConfig, hidden_size=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_xxx_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xxx_model(*config_and_inputs)
+
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xxx_for_masked_lm(*config_and_inputs)
+
+    def test_for_question_answering(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xxx_for_question_answering(*config_and_inputs)
+
+    def test_for_sequence_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xxx_for_sequence_classification(*config_and_inputs)
+
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
+
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/transformers_test/"
+        for model_name in ['xxx-base-uncased']:
+            model = TFXxxModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)  # delete the cache directory before asserting on the loaded model
+            self.assertIsNotNone(model)
+
+if __name__ == "__main__":  # run the tests when this file is executed as a script
+    unittest.main()
diff --git a/templates/adding_a_new_model/tests/modeling_xxx_test.py b/templates/adding_a_new_model/tests/modeling_xxx_test.py
new file mode 100644
index 0000000000..8c0cc3cf32
--- /dev/null
+++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py
@@ -0,0 +1,255 @@
+# coding=utf-8
+# Copyright 2018 XXX Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import shutil
+import pytest
+
+from transformers import is_torch_available
+
+from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester
+
+if is_torch_available():
+    from transformers import (XxxConfig, XxxModel, XxxForMaskedLM,
+                                        XxxForNextSentencePrediction, XxxForPreTraining,
+                                        XxxForQuestionAnswering, XxxForSequenceClassification,
+                                        XxxForTokenClassification, XxxForMultipleChoice)
+    from transformers.modeling_xxx import XXX_PRETRAINED_MODEL_ARCHIVE_MAP
+else:
+    pytestmark = pytest.mark.skip("Require Torch")
+
+
+class XxxModelTest(CommonTestCases.CommonModelTester):
+
+    all_model_classes = (XxxModel, XxxForMaskedLM, XxxForQuestionAnswering,
+                         XxxForSequenceClassification,
+                         XxxForTokenClassification) if is_torch_available() else ()
+
+    class XxxModelTester(object):
+
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_input_mask=True,
+                     use_token_type_ids=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     intermediate_size=37,
+                     hidden_act="gelu",
+                     hidden_dropout_prob=0.1,
+                     attention_probs_dropout_prob=0.1,
+                     max_position_embeddings=512,
+                     type_vocab_size=16,
+                     type_sequence_label_size=2,
+                     initializer_range=0.02,
+                     num_labels=3,
+                     num_choices=4,
+                     scope=None,
+                    ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_input_mask = use_input_mask
+            self.use_token_type_ids = use_token_type_ids
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.intermediate_size = intermediate_size
+            self.hidden_act = hidden_act
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.type_sequence_label_size = type_sequence_label_size
+            self.initializer_range = initializer_range
+            self.num_labels = num_labels
+            self.num_choices = num_choices
+            self.scope = scope
+
+        def prepare_config_and_inputs(self):
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+            input_mask = None
+            if self.use_input_mask:
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+            token_type_ids = None
+            if self.use_token_type_ids:
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+            sequence_labels = None
+            token_labels = None
+            choice_labels = None
+            if self.use_labels:
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+            config = XxxConfig(
+                vocab_size_or_config_json_file=self.vocab_size,
+                hidden_size=self.hidden_size,
+                num_hidden_layers=self.num_hidden_layers,
+                num_attention_heads=self.num_attention_heads,
+                intermediate_size=self.intermediate_size,
+                hidden_act=self.hidden_act,
+                hidden_dropout_prob=self.hidden_dropout_prob,
+                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+                max_position_embeddings=self.max_position_embeddings,
+                type_vocab_size=self.type_vocab_size,
+                initializer_range=self.initializer_range)
+
+            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+        def check_loss_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+
+        def create_and_check_xxx_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = XxxModel(config=config)
+            model.eval()
+            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+            sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
+            sequence_output, pooled_output = model(input_ids)
+
+            result = {
+                "sequence_output": sequence_output,
+                "pooled_output": pooled_output,
+            }
+            self.parent.assertListEqual(
+                list(result["sequence_output"].size()),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+
+
+        def create_and_check_xxx_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = XxxForMaskedLM(config=config)
+            model.eval()
+            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels)
+            result = {
+                "loss": loss,
+                "prediction_scores": prediction_scores,
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
+            self.check_loss_output(result)
+
+
+        def create_and_check_xxx_for_question_answering(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = XxxForQuestionAnswering(config=config)
+            model.eval()
+            loss, start_logits, end_logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids,
+                                                   start_positions=sequence_labels, end_positions=sequence_labels)
+            result = {
+                "loss": loss,
+                "start_logits": start_logits,
+                "end_logits": end_logits,
+            }
+            self.parent.assertListEqual(
+                list(result["start_logits"].size()),
+                [self.batch_size, self.seq_length])
+            self.parent.assertListEqual(
+                list(result["end_logits"].size()),
+                [self.batch_size, self.seq_length])
+            self.check_loss_output(result)
+
+
+        def create_and_check_xxx_for_sequence_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            config.num_labels = self.num_labels
+            model = XxxForSequenceClassification(config)
+            model.eval()
+            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=sequence_labels)
+            result = {
+                "loss": loss,
+                "logits": logits,
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].size()),
+                [self.batch_size, self.num_labels])
+            self.check_loss_output(result)
+
+
+        def create_and_check_xxx_for_token_classification(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            config.num_labels = self.num_labels
+            model = XxxForTokenClassification(config=config)
+            model.eval()
+            loss, logits = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, labels=token_labels)
+            result = {
+                "loss": loss,
+                "logits": logits,
+            }
+            self.parent.assertListEqual(
+                list(result["logits"].size()),
+                [self.batch_size, self.seq_length, self.num_labels])
+            self.check_loss_output(result)
+
+
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, token_type_ids, input_mask,
+             sequence_labels, token_labels, choice_labels) = config_and_inputs
+            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            return config, inputs_dict
+
+    def setUp(self):
+        self.model_tester = XxxModelTest.XxxModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=XxxConfig, hidden_size=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_xxx_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xxx_model(*config_and_inputs)
+
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xxx_for_masked_lm(*config_and_inputs)
+
+    def test_for_question_answering(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xxx_for_question_answering(*config_and_inputs)
+
+    def test_for_sequence_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xxx_for_sequence_classification(*config_and_inputs)
+
+    def test_for_token_classification(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_xxx_for_token_classification(*config_and_inputs)
+
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/transformers_test/"
+        for model_name in list(XXX_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = XxxModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/templates/adding_a_new_model/tests/tokenization_xxx_test.py b/templates/adding_a_new_model/tests/tokenization_xxx_test.py
new file mode 100644
index 0000000000..116083edc8
--- /dev/null
+++ b/templates/adding_a_new_model/tests/tokenization_xxx_test.py
@@ -0,0 +1,57 @@
+# coding=utf-8
+# Copyright 2018 XXX Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+from io import open
+
+from transformers.tokenization_xxx import (XxxTokenizer, VOCAB_FILES_NAMES)
+
+from .tokenization_tests_commons import CommonTestCases
+
+class XxxTokenizationTest(CommonTestCases.CommonTokenizerTester):
+
+    tokenizer_class = XxxTokenizer
+
+    def setUp(self):
+        super(XxxTokenizationTest, self).setUp()
+
+        vocab_tokens = [
+            "[UNK]", "[CLS]", "[SEP]", "want", "##want", "##ed", "wa", "un", "runn",
+            "##ing", ",", "low", "lowest",
+        ]
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+        with open(self.vocab_file, "w", encoding='utf-8') as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+
+    def get_tokenizer(self, **kwargs):
+        return XxxTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_input_output_texts(self):
+        input_text = u"UNwant\u00E9d,running"
+        output_text = u"unwanted, running"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = self.tokenizer_class(self.vocab_file)
+
+        tokens = tokenizer.tokenize(u"UNwant\u00E9d,running")
+        self.assertListEqual(tokens, ["un", "##want", "##ed", ",", "runn", "##ing"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens), [7, 4, 5, 10, 8, 9])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py
new file mode 100644
index 0000000000..1b1325aab5
--- /dev/null
+++ b/templates/adding_a_new_model/tokenization_xxx.py
@@ -0,0 +1,218 @@
+# coding=utf-8
+# Copyright 2018 XXX Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization class for model XXX."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import collections
+import logging
+import os
+import unicodedata
+from io import open
+
+from .tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+####################################################
+# In this template, replace all the XXX (various casings) with your model name
+####################################################
+
+####################################################
+# Mapping from the keyword arguments names of Tokenizer `__init__`
+# to file names for serializing Tokenizer instances
+####################################################
+VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+
+####################################################
+# Mapping from the keyword arguments names of Tokenizer `__init__`
+# to pretrained vocabulary URL for all the model shortcut names.
+####################################################
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'xxx-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-base-uncased-vocab.txt",
+        'xxx-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/xxx-large-uncased-vocab.txt",
+    }
+}
+
+####################################################
+# Mapping from model shortcut names to max length of inputs
+####################################################
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'xxx-base-uncased': 512,
+    'xxx-large-uncased': 512,
+}
+
+####################################################
+# Mapping from model shortcut names to a dictionary of additional
+# keyword arguments for Tokenizer `__init__`.
+# To be used for checkpoint specific configurations.
+####################################################
+PRETRAINED_INIT_CONFIGURATION = {
+    'xxx-base-uncased': {'do_lower_case': True},
+    'xxx-large-uncased': {'do_lower_case': True},
+}
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip('\n')
+        vocab[token] = index
+    return vocab
+
+
+class XxxTokenizer(PreTrainedTokenizer):
+    r"""
+    Constructs a XxxTokenizer.
+    :class:`~transformers.XxxTokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
+
+    Args:
+        vocab_file: Path to a one-wordpiece-per-line vocabulary file
+        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, do_lower_case=True,
+                 unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
+                 mask_token="[MASK]", **kwargs):
+        """Constructs a XxxTokenizer.
+
+        Args:
+            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
+            **do_lower_case**: (`optional`) boolean (default True)
+                Whether to lower case the input
+                Only has an effect when do_basic_tokenize=True
+        """
+        super(XxxTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token, pad_token=pad_token,
+                                           cls_token=cls_token, mask_token=mask_token, **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = XxxTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict((i, t) for t, i in self.vocab.items())  # id -> token lookup
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def _tokenize(self, text):
+        """ Take as input a string and return a list of strings (tokens) for words/sub-words
+        """
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
+                for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                    split_tokens.append(sub_token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
+        return split_tokens
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        out_string = ' '.join(tokens).replace(' ##', '').strip()
+        return out_string
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A BERT sequence has the following format:
+            single sequence: [CLS] X [SEP]
+            pair of sequences: [CLS] A [SEP] B [SEP]
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
+                special tokens for the model
+
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                                 "ids is already formated with special tokens for the model.")
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A BERT sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    def save_vocabulary(self, vocab_path):
+        """Save the tokenizer vocabulary to a directory or file."""
+        index = 0
+        if os.path.isdir(vocab_path):
+            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+        else:
+            vocab_file = vocab_path
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
+                                   " Please check that the vocabulary is not corrupted!".format(vocab_file))
+                    index = token_index
+                writer.write(token + u'\n')
+                index += 1
+        return (vocab_file,)

From 328a86d2af6b2c4d4b57cbed9f0a6eb0b3ad6256 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 30 Oct 2019 11:37:55 +0100
Subject: [PATCH 133/144] adding links to the templates in readme and
 contributing

---
 CONTRIBUTING.md | 4 ++++
 README.md       | 1 +
 2 files changed, 5 insertions(+)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 817ba56aaf..69f6af78ca 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -62,6 +62,8 @@ Awesome! Please provide the following information:
 If you are willing to contribute the model yourself, let us know so we can best
 guide you.
 
+We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find then in the [`templates`](./templates) folder.
+
 ### Do you want a new feature (that is not a model)?
 
 A world-class feature request addresses the following points:
@@ -81,6 +83,8 @@ A world-class feature request addresses the following points:
 If your issue is well written we're already 80% of the way there by the time you
 post it.
 
+We have added **templates** to guide you in the process of adding a new example script for training or testing the models in the library. You can find then in the [`templates`](./templates) folder.
+
 ## Start contributing! (Pull Requests)
 
 Before writing code, we strongly advise you to search through the exising PRs or
diff --git a/README.md b/README.md
index ecba50a74e..12167e6e9c 100644
--- a/README.md
+++ b/README.md
@@ -122,6 +122,7 @@ At some point in the future, you'll be able to seamlessly move from pre-training
 7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation).
 9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
+10. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
 
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
 

From cef2a8f9002dbee18ede7762a3595722468c6de3 Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Wed, 30 Oct 2019 12:25:31 +0100
Subject: [PATCH 134/144] Update CONTRIBUTING.md

Co-Authored-By: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 69f6af78ca..db70cc644e 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -62,7 +62,7 @@ Awesome! Please provide the following information:
 If you are willing to contribute the model yourself, let us know so we can best
 guide you.
 
-We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find then in the [`templates`](./templates) folder.
+We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder.
 
 ### Do you want a new feature (that is not a model)?
 

From 55fbfea369d9b43cbceeb362806ef4ca56b88acc Mon Sep 17 00:00:00 2001
From: Thomas Wolf <thomwolf@users.noreply.github.com>
Date: Wed, 30 Oct 2019 12:25:40 +0100
Subject: [PATCH 135/144] Update CONTRIBUTING.md

Co-Authored-By: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
---
 CONTRIBUTING.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index db70cc644e..136ef8df81 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -83,7 +83,7 @@ A world-class feature request addresses the following points:
 If your issue is well written we're already 80% of the way there by the time you
 post it.
 
-We have added **templates** to guide you in the process of adding a new example script for training or testing the models in the library. You can find then in the [`templates`](./templates) folder.
+We have added **templates** to guide you in the process of adding a new example script for training or testing the models in the library. You can find them in the [`templates`](./templates) folder.
 
 ## Start contributing! (Pull Requests)
 

From 3f07cd419ce973b4ca8fd4e12fe664d08408b343 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 30 Oct 2019 15:09:53 +0100
Subject: [PATCH 136/144] update test on Bert to include decoder mode

---
 transformers/tests/modeling_bert_test.py | 50 +++++++++++++++++++++++-
 1 file changed, 49 insertions(+), 1 deletion(-)

diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py
index 6c39c4e4db..67be910a7e 100644
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -22,7 +22,7 @@ import pytest
 
 from transformers import is_torch_available
 
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
 from .configuration_common_test import ConfigTester
 
 if is_torch_available():
@@ -120,10 +120,20 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 attention_probs_dropout_prob=self.attention_probs_dropout_prob,
                 max_position_embeddings=self.max_position_embeddings,
                 type_vocab_size=self.type_vocab_size,
+                is_decoder=False,
                 initializer_range=self.initializer_range)
 
             return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
 
+        def prepare_config_and_inputs_for_decoder(self):
+            config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels = self.prepare_config_and_inputs()
+
+            config.is_decoder = True
+            encoder_hidden_states = floats_tensor([self.batch_size, self.seq_length, self.hidden_size])
+            encoder_attention_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask
+
         def check_loss_output(self, result):
             self.parent.assertListEqual(
                 list(result["loss"].size()),
@@ -145,6 +155,22 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.seq_length, self.hidden_size])
             self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
 
+        def create_and_check_bert_model_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
+            model = BertModel(config)
+            model.eval()
+            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
+            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, encoder_hidden_states=encoder_hidden_states)
+            sequence_output, pooled_output = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids)
+
+            result = {
+                "sequence_output": sequence_output,
+                "pooled_output": pooled_output,
+            }
+            self.parent.assertListEqual(
+                list(result["sequence_output"].size()),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+
         def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForMaskedLM(config=config)
             model.eval()
@@ -158,6 +184,20 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 [self.batch_size, self.seq_length, self.vocab_size])
             self.check_loss_output(result)
 
+        def create_and_check_bert_model_for_masked_lm_as_decoder(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels, encoder_hidden_states, encoder_attention_mask):
+            model = BertForMaskedLM(config=config)
+            model.eval()
+            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states, encoder_attention_mask=encoder_attention_mask)
+            loss, prediction_scores = model(input_ids, attention_mask=input_mask, token_type_ids=token_type_ids, masked_lm_labels=token_labels, encoder_hidden_states=encoder_hidden_states)
+            result = {
+                "loss": loss,
+                "prediction_scores": prediction_scores,
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
+            self.check_loss_output(result)
+
         def create_and_check_bert_for_next_sequence_prediction(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
             model = BertForNextSentencePrediction(config=config)
             model.eval()
@@ -273,10 +313,18 @@ class BertModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_model(*config_and_inputs)
 
+    def test_bert_model_as_decoder(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
+        self.model_tester.create_and_check_bert_model_as_decoder(*config_and_inputs)
+
     def test_for_masked_lm(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_for_masked_lm(*config_and_inputs)
 
+    def test_for_masked_lm_decoder(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs_for_decoder()
+        self.model_tester.create_and_check_bert_model_for_masked_lm_as_decoder(*config_and_inputs)
+
     def test_for_multiple_choice(self):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_bert_for_multiple_choice(*config_and_inputs)

From a88a0e4413d8de5ad235a211fb3b0326aadc5ce0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 30 Oct 2019 16:06:29 +0100
Subject: [PATCH 137/144] add tests to encoder-decoder model

---
 transformers/tests/modeling_common_test.py    | 16 ++++++
 .../tests/modeling_encoder_decoder_test.py    | 52 +++++++++++++++++++
 2 files changed, 68 insertions(+)
 create mode 100644 transformers/tests/modeling_encoder_decoder_test.py

diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index 2b66757c28..1c1794550c 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -704,6 +704,22 @@ def ids_tensor(shape, vocab_size, rng=None, name=None):
     return torch.tensor(data=values, dtype=torch.long).view(shape).contiguous()
 
 
+def floats_tensor(shape, scale=1.0, rng=None, name=None):
+    """Creates a random float32 tensor of the given shape; each value is `rng.random() * scale`."""
+    if rng is None:
+        rng = global_rng
+
+    total_dims = 1
+    for dim in shape:
+        total_dims *= dim
+
+    values = []
+    for _ in range(total_dims):
+        values.append(rng.random() * scale)
+
+    return torch.tensor(data=values, dtype=torch.float).view(shape).contiguous()
+
+
 class ModelUtilsTest(unittest.TestCase):
     def test_model_from_pretrained(self):
         logging.basicConfig(level=logging.INFO)
diff --git a/transformers/tests/modeling_encoder_decoder_test.py b/transformers/tests/modeling_encoder_decoder_test.py
new file mode 100644
index 0000000000..1ffd0ebc4c
--- /dev/null
+++ b/transformers/tests/modeling_encoder_decoder_test.py
@@ -0,0 +1,52 @@
+# coding=utf-8
+# Copyright 2018 The Hugging Face Inc. Team
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+import unittest
+import pytest
+
+from transformers import is_torch_available
+
+if is_torch_available():
+    from transformers import BertModel, BertForMaskedLM, Model2Model
+    from transformers.modeling_bert import BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+else:
+    pytestmark = pytest.mark.skip("Require Torch")
+
+
+class EncoderDecoderModelTest(unittest.TestCase):
+    def test_model2model_from_pretrained(self):
+        logging.basicConfig(level=logging.INFO)
+        for model_name in list(BERT_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = Model2Model.from_pretrained(model_name)
+            self.assertIsInstance(model.encoder, BertModel)
+            self.assertIsInstance(model.decoder, BertForMaskedLM)
+            self.assertEqual(model.decoder.config.is_decoder, True)
+            self.assertEqual(model.encoder.config.is_decoder, False)
+
+    def test_model2model_from_pretrained_not_bert(self):
+        logging.basicConfig(level=logging.INFO)
+        with self.assertRaises(ValueError):
+            _ = Model2Model.from_pretrained('roberta')
+
+        with self.assertRaises(ValueError):
+            _ = Model2Model.from_pretrained('distilbert')
+
+        with self.assertRaises(ValueError):
+            _ = Model2Model.from_pretrained('does-not-exist')
+
+
+if __name__ == "__main__":
+    unittest.main()

From 3cf2020c6becb3fb9c8f3c6db684184b5cdb2ac3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Wed, 30 Oct 2019 16:27:51 +0100
Subject: [PATCH 138/144] change kwargs processing

---
 transformers/modeling_encoder_decoder.py | 71 ++++++++++++++----------
 1 file changed, 43 insertions(+), 28 deletions(-)

diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py
index 162e2f8b3b..a884abd0a2 100644
--- a/transformers/modeling_encoder_decoder.py
+++ b/transformers/modeling_encoder_decoder.py
@@ -114,23 +114,28 @@ class PreTrainedEncoderDecoder(nn.Module):
         # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
         # that apply to the model as a whole.
         # We let the specific kwargs override the common ones in case of conflict.
-        kwargs_encoder = {
-            argument[len("encoder_"):]: value
-            for argument, value in kwargs.items()
-            if argument.startswith("encoder_")
-        }
-        kwargs_decoder = {
-            argument[len("decoder_"):]: value
-            for argument, value in kwargs.items()
-            if argument.startswith("decoder_")
-        }
         kwargs_common = {
             argument: value
             for argument, value in kwargs.items()
-            if not (argument.startswith("encoder_") or argument.startswith("decoder_"))
+            if not argument.startswith("encoder_")
+            and not argument.startswith("decoder_")
         }
-        kwargs_decoder = dict(kwargs_common, **kwargs_decoder)
-        kwargs_encoder = dict(kwargs_common, **kwargs_encoder)
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_encoder.update(
+            {
+                argument[len("encoder_") :]: value
+                for argument, value in kwargs.items()
+                if argument.startswith("encoder_")
+            }
+        )
+        kwargs_decoder.update(
+            {
+                argument[len("decoder_") :]: value
+                for argument, value in kwargs.items()
+                if argument.startswith("decoder_")
+            }
+        )
 
         # Load and initialize the encoder and decoder
         # The distinction between encoder and decoder at the model level is made
@@ -185,35 +190,44 @@ class PreTrainedEncoderDecoder(nn.Module):
         # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
         # that apply to the model as whole.
         # We let the specific kwargs override the common ones in case of conflict.
-        kwargs_encoder = {
-            argument[len("encoder_"):]: value
-            for argument, value in kwargs.items()
-            if argument.startswith("encoder_")
-        }
-        kwargs_decoder = {
-            argument[len("decoder_"):]: value
-            for argument, value in kwargs.items()
-            if argument.startswith("decoder_")
-        }
         kwargs_common = {
             argument: value
             for argument, value in kwargs.items()
-            if not (argument.startswith("encoder_") or argument.startswith("decoder_"))
+            if not argument.startswith("encoder_")
+            and not argument.startswith("decoder_")
         }
-        kwargs_decoder = dict(kwargs_common, **kwargs_decoder)
-        kwargs_encoder = dict(kwargs_common, **kwargs_encoder)
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_encoder.update(
+            {
+                argument[len("encoder_") :]: value
+                for argument, value in kwargs.items()
+                if argument.startswith("encoder_")
+            }
+        )
+        kwargs_decoder.update(
+            {
+                argument[len("decoder_") :]: value
+                for argument, value in kwargs.items()
+                if argument.startswith("decoder_")
+            }
+        )
 
         # Encode if needed (training, first prediction pass)
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
         if encoder_hidden_states is None:
             encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[0]  # output the last layer hidden state
+            encoder_hidden_states = encoder_outputs[
+                0
+            ]  # output the last layer hidden state
         else:
             encoder_outputs = ()
 
         # Decode
         kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
-        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
+        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get(
+            "attention_mask", None
+        )
         decoder_outputs = self.decoder(decoder_input_ids, **kwargs_decoder)
 
         return decoder_outputs + encoder_outputs
@@ -235,6 +249,7 @@ class Model2Model(PreTrainedEncoderDecoder):
             decoder = BertForMaskedLM(config)
             model = Model2Model.from_pretrained('bert-base-uncased', decoder_model=decoder)
     """
+
     def __init__(self, *args, **kwargs):
         super(Model2Model, self).__init__(*args, **kwargs)
         self.tie_weights()

From fa735208c96c18283b8d2f3fcbfc3157bbd12b1e Mon Sep 17 00:00:00 2001
From: Victor SANH <victorsanh@gmail.com>
Date: Wed, 30 Oct 2019 14:27:28 -0400
Subject: [PATCH 139/144] update readme - fix example command distil*

---
 examples/distillation/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/distillation/README.md b/examples/distillation/README.md
index 7da1ad015b..8efd1ea6f4 100644
--- a/examples/distillation/README.md
+++ b/examples/distillation/README.md
@@ -108,7 +108,7 @@ python train.py \
     --student_config training_configs/distilbert-base-uncased.json \
     --teacher_type bert \
     --teacher_name bert-base-uncased \
-    --alpha_ce 5.0 --alpha_mlm 2.0 --alpha_cos 1.0 --mlm \
+    --alpha_ce 5.0 --alpha_mlm 2.0 --alpha_cos 1.0 --alpha_clm 0.0 --mlm \
     --freeze_pos_embs \
     --dump_path serialization_dir/my_first_training \
     --data_file data/binarized_text.bert-base-uncased.pickle \
@@ -144,7 +144,7 @@ python -m torch.distributed.launch \
         --student_config training_configs/distilbert-base-uncased.json \
         --teacher_type bert \
         --teacher_name bert-base-uncased \
-        --alpha_ce 0.33 --alpha_mlm 0.33 --alpha_cos 0.33 --mlm \
+        --alpha_ce 0.33 --alpha_mlm 0.33 --alpha_cos 0.33 --alpha_clm 0.0 --mlm \
         --freeze_pos_embs \
         --dump_path serialization_dir/my_first_training \
         --data_file data/binarized_text.bert-base-uncased.pickle \
@@ -166,4 +166,4 @@ If you find the ressource useful, you should cite the following paper:
   booktitle={NeurIPS EMC^2 Workshop},
   year={2019}
 }
-```
\ No newline at end of file
+```

From ac29353abe4ba30a488c14cbfb897ed41dc0fa05 Mon Sep 17 00:00:00 2001
From: cregouby <cregouby@users.noreply.github.com>
Date: Thu, 31 Oct 2019 10:04:40 +0100
Subject: [PATCH 140/144] Fix
 https://github.com/huggingface/transformers/issues/1673

---
 transformers/modeling_utils.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index c88cab143a..03490630ed 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -284,6 +284,7 @@ class PreTrainedModel(nn.Module):
                 pretrained_model_name_or_path, *model_args,
                 cache_dir=cache_dir, return_unused_kwargs=True,
                 force_download=force_download,
+                proxies=proxies,
                 **kwargs
             )
         else:

From f96ce1c24151349251880c95e9a9fb144b62367c Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Thu, 31 Oct 2019 18:27:11 +0000
Subject: [PATCH 141/144] [run_generation] Fix generation with batch_size>1

---
 examples/run_generation.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/examples/run_generation.py b/examples/run_generation.py
index b8cc8a9bbf..2d917660cf 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -79,7 +79,7 @@ def set_seed(args):
 def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
     """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
         Args:
-            logits: logits distribution shape (vocabulary size)
+            logits: logits distribution shape (batch size x vocabulary size)
             top_k > 0: keep only top k tokens with highest probability (top-k filtering).
             top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                 Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
@@ -138,13 +138,14 @@ def sample_sequence(model, length, context, num_samples=1, temperature=1, top_k=
             outputs = model(**inputs)  # Note: we could also use 'past' with GPT-2/Transfo-XL/XLNet/CTRL (cached hidden-states)
             next_token_logits = outputs[0][:, -1, :] / (temperature if temperature > 0 else 1.)
 
-            # reptition penalty from CTRL (https://arxiv.org/abs/1909.05858)
-            for _ in set(generated.view(-1).tolist()):
-                next_token_logits[_] /= repetition_penalty
+            # repetition penalty from CTRL (https://arxiv.org/abs/1909.05858)
+            for i in range(num_samples):
+                for _ in set(generated[i].tolist()):
+                    next_token_logits[i, _] /= repetition_penalty
                 
             filtered_logits = top_k_top_p_filtering(next_token_logits, top_k=top_k, top_p=top_p)
-            if temperature == 0: #greedy sampling:
-                next_token = torch.argmax(filtered_logits).unsqueeze(0)
+            if temperature == 0: # greedy sampling:
+                next_token = torch.argmax(filtered_logits, dim=-1).unsqueeze(-1)
             else:
                 next_token = torch.multinomial(F.softmax(filtered_logits, dim=-1), num_samples=1)
             generated = torch.cat((generated, next_token), dim=1)

From be36cf92fb96de1766b092c282c7609dcbe4a8e7 Mon Sep 17 00:00:00 2001
From: Timothy Liu <tlkh.xms@gmail.com>
Date: Wed, 30 Oct 2019 00:44:23 +0000
Subject: [PATCH 142/144] Added mixed precision support to benchmarks.py

---
 examples/benchmarks.py | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/examples/benchmarks.py b/examples/benchmarks.py
index 06f368d946..26c260b9ec 100644
--- a/examples/benchmarks.py
+++ b/examples/benchmarks.py
@@ -253,18 +253,22 @@ def create_setup_and_compute(model_names: List[str],
                              average_over: int = 3,
                              torchscript: bool = False,
                              xla: bool = False,
+                             amp: bool = False,
+                             fp16: bool = False,
                              save_to_csv: bool = False,
                              csv_filename: str = f"results_{round(time())}.csv"):
     if xla:
         tf.config.optimizer.set_jit(True)
+    if amp:
+        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})
 
     if tensorflow:
         dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_tensorflow(model_names, dictionary, average_over)
+        results = _compute_tensorflow(model_names, dictionary, average_over, amp)
     else:
         device = 'cuda' if (gpu and torch.cuda.is_available()) else 'cpu'
         dictionary = {model_name: {} for model_name in model_names}
-        results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript)
+        results = _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16)
 
     print("=========== RESULTS ===========")
     for model_name in model_names:
@@ -302,7 +306,7 @@ def create_setup_and_compute(model_names: List[str],
                 writer.writerow({'model': model_name, **model_results})
 
 
-def _compute_pytorch(model_names, dictionary, average_over, device, torchscript):
+def _compute_pytorch(model_names, dictionary, average_over, device, torchscript, fp16):
     for c, model_name in enumerate(model_names):
         print(f"{c + 1} / {len(model_names)}")
         config = AutoConfig.from_pretrained(model_name, torchscript=torchscript)
@@ -319,6 +323,8 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript)
         dictionary[model_name]["results"] = {i: {} for i in batch_sizes}
 
         for batch_size in batch_sizes:
+            if fp16:
+                model.half()
             model.to(device)
             model.eval()
             for slice_size in slice_sizes:
@@ -346,7 +352,7 @@ def _compute_pytorch(model_names, dictionary, average_over, device, torchscript)
     return dictionary
 
 
-def _compute_tensorflow(model_names, dictionary, average_over):
+def _compute_tensorflow(model_names, dictionary, average_over, amp):
     for c, model_name in enumerate(model_names):
         print(f"{c + 1} / {len(model_names)}")
         config = AutoConfig.from_pretrained(model_name)
@@ -409,6 +415,8 @@ def main():
                                                                                   "the correct dependencies are "
                                                                                   "installed")
     parser.add_argument("--xla", required=False, action="store_true", help="TensorFlow only: use XLA acceleration.")
+    parser.add_argument("--amp", required=False, action="store_true", help="TensorFlow only: use automatic mixed precision acceleration.")
+    parser.add_argument("--fp16", required=False, action="store_true", help="PyTorch only: use FP16 to accelerate inference.")
     parser.add_argument("--keras_predict", required=False, action="store_true", help="Whether to use model.predict "
                                                                                      "instead of model() to do a "
                                                                                      "forward pass.")
@@ -442,6 +450,7 @@ def main():
                 tensorflow=False,
                 gpu=args.torch_cuda,
                 torchscript=args.torchscript,
+                fp16=args.fp16,
                 save_to_csv=args.save_to_csv,
                 csv_filename=args.csv_filename,
                 average_over=args.average_over
@@ -455,6 +464,7 @@ def main():
                 model_names=args.models,
                 tensorflow=True,
                 xla=args.xla,
+                amp=args.amp,
                 save_to_csv=args.save_to_csv,
                 csv_filename=args.csv_filename,
                 average_over=args.average_over

From 1a2b40cb53477b94c66718bac8d997297fcc8043 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Thu, 31 Oct 2019 18:00:51 -0400
Subject: [PATCH 143/144] run_tf_glue MRPC evaluation only for MRPC

---
 examples/run_tf_glue.py | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

diff --git a/examples/run_tf_glue.py b/examples/run_tf_glue.py
index 73173b0cf1..8878ce726e 100644
--- a/examples/run_tf_glue.py
+++ b/examples/run_tf_glue.py
@@ -71,20 +71,21 @@ history = model.fit(train_dataset, epochs=EPOCHS, steps_per_epoch=train_steps,
 os.makedirs('./save/', exist_ok=True)
 model.save_pretrained('./save/')
 
-# Load the TensorFlow model in PyTorch for inspection
-pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
+if TASK == "mrpc":
+    # Load the TensorFlow model in PyTorch for inspection
+    pytorch_model = BertForSequenceClassification.from_pretrained('./save/', from_tf=True)
 
-# Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
-sentence_0 = 'This research was consistent with his findings.'
-sentence_1 = 'His findings were compatible with this research.'
-sentence_2 = 'His findings were not compatible with this research.'
-inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
-inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
+    # Quickly test a few predictions - MRPC is a paraphrasing task, let's see if our model learned the task
+    sentence_0 = 'This research was consistent with his findings.'
+    sentence_1 = 'His findings were compatible with this research.'
+    sentence_2 = 'His findings were not compatible with this research.'
+    inputs_1 = tokenizer.encode_plus(sentence_0, sentence_1, add_special_tokens=True, return_tensors='pt')
+    inputs_2 = tokenizer.encode_plus(sentence_0, sentence_2, add_special_tokens=True, return_tensors='pt')
 
-del inputs_1["special_tokens_mask"]
-del inputs_2["special_tokens_mask"]
+    del inputs_1["special_tokens_mask"]
+    del inputs_2["special_tokens_mask"]
 
-pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
-pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
-print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
-print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')
+    pred_1 = pytorch_model(**inputs_1)[0].argmax().item()
+    pred_2 = pytorch_model(**inputs_2)[0].argmax().item()
+    print('sentence_1 is', 'a paraphrase' if pred_1 else 'not a paraphrase', 'of sentence_0')
+    print('sentence_2 is', 'a paraphrase' if pred_2 else 'not a paraphrase', 'of sentence_0')

From 93d2fff0716d83df168ca0686d16bc4cd7ccb366 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Fri, 1 Nov 2019 09:47:38 -0400
Subject: [PATCH 144/144] Close #1654

---
 docs/source/model_doc/ctrl.rst | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/docs/source/model_doc/ctrl.rst b/docs/source/model_doc/ctrl.rst
index 9fd5b4acdb..36b37b3ee1 100644
--- a/docs/source/model_doc/ctrl.rst
+++ b/docs/source/model_doc/ctrl.rst
@@ -1,6 +1,11 @@
 CTRL
 ----------------------------------------------------
 
+Note: if you fine-tune a CTRL model using the Salesforce code (https://github.com/salesforce/ctrl),
+you'll be able to convert it from TensorFlow to our HuggingFace/Transformers format using the
+``convert_tf_to_huggingface_pytorch.py`` script (see `issue #1654 <https://github.com/huggingface/transformers/issues/1654>`_).
+
+
 ``CTRLConfig``
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~