From 60a5babd57dd80f855df859abf006ee4488ff639 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 5 Nov 2019 12:01:23 +0100
Subject: [PATCH 001/110] adding files

---
 transformers/configuration_t5.py              | 130 +++++
 ...rt_t5_original_tf_checkpoint_to_pytorch.py |  65 +++
 transformers/modeling_t5.py                   | 373 +++++++++++++
 transformers/modeling_tf_t5.py                | 496 ++++++++++++++++++
 transformers/tokenization_t5.py               | 214 ++++++++
 5 files changed, 1278 insertions(+)
 create mode 100644 transformers/configuration_t5.py
 create mode 100755 transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
 create mode 100644 transformers/modeling_t5.py
 create mode 100644 transformers/modeling_tf_t5.py
 create mode 100644 transformers/tokenization_t5.py

diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py
new file mode 100644
index 0000000000..a37a5b2157
--- /dev/null
+++ b/transformers/configuration_t5.py
@@ -0,0 +1,130 @@
+# coding=utf-8
+# Copyright 2010, The T5 Authors and HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" T5 model configuration """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import sys
+import six
+from io import open
+
+from .configuration_utils import PretrainedConfig
+
+logger = logging.getLogger(__name__)
+
+T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-config.json",
+    't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-config.json",
+}
+
+
+class T5Config(PretrainedConfig):
+    r"""
+        :class:`~transformers.T5Config` is the configuration class to store the configuration of a
+        `T5Model`.
+
+
+        Arguments:
+            vocab_size_or_config_json_file: Either the vocabulary size (int), stored as
+                `vocab_size`, or the path (str) of a JSON config file whose entries are copied
+                onto this object (`vocab_size` is then -1 unless the file provides it).
+            n_positions: Stored as-is; also readable as `max_position_embeddings`.
+            n_ctx: Stored as-is.
+            n_embd: Stored as-is; also readable as `hidden_size`.
+            n_layer: Stored as-is; also readable as `num_hidden_layers`.
+            n_head: Stored as-is; also readable as `num_attention_heads`.
+            resid_pdrop, embd_pdrop, attn_pdrop: Stored as-is (presumably dropout probabilities).
+            layer_norm_epsilon: Stored as-is (presumably the epsilon used by LayerNorm).
+            initializer_range: Std of the normal distribution used by
+                `T5PreTrainedModel._init_weights` for Linear and Embedding weights.
+            num_labels, summary_type, summary_use_proj, summary_activation,
+                summary_proj_to_labels, summary_first_dropout: Stored as-is.
+            kwargs: Forwarded to `PretrainedConfig.__init__`.
+
+        Raises:
+            ValueError: If the first argument is neither an int nor a string.
+
+        NOTE(review): the parameter names and defaults look inherited from a GPT-2 style
+        template and may not match the real T5 hyper-parameters - confirm.
+    """
+    pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+    def __init__(self,
+                 vocab_size_or_config_json_file=50257,
+                 n_positions=1024,
+                 n_ctx=1024,
+                 n_embd=768,
+                 n_layer=12,
+                 n_head=12,
+                 resid_pdrop=0.1,
+                 embd_pdrop=0.1,
+                 attn_pdrop=0.1,
+                 layer_norm_epsilon=1e-5,
+                 initializer_range=0.02,
+
+                 num_labels=1,
+                 summary_type='cls_index',
+                 summary_use_proj=True,
+                 summary_activation=None,
+                 summary_proj_to_labels=True,
+                 summary_first_dropout=0.1,
+                 **kwargs):
+        super(T5Config, self).__init__(**kwargs)
+        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.n_ctx = n_ctx
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+
+        self.num_labels = num_labels
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels
+        if isinstance(vocab_size_or_config_json_file, six.string_types):
+            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
+                json_config = json.loads(reader.read())
+            for key, value in json_config.items():
+                self.__dict__[key] = value
+        elif not isinstance(vocab_size_or_config_json_file, int):
+            raise ValueError(
+                "First argument must be either a vocabulary size (int) "
+                "or the path to a pretrained model config file (str)"
+            )
+
+    @property
+    def max_position_embeddings(self):
+        return self.n_positions
+
+    @property
+    def hidden_size(self):
+        return self.n_embd
+
+    @property
+    def num_attention_heads(self):
+        return self.n_head
+
+    @property
+    def num_hidden_layers(self):
+        return self.n_layer
diff --git a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
new file mode 100755
index 0000000000..608027ebac
--- /dev/null
+++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
@@ -0,0 +1,65 @@
+# coding=utf-8
+# Copyright 2018 The T5 authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert T5 checkpoint."""
+
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import argparse
+import torch
+
+from transformers import T5Config, T5ForPreTraining, load_tf_weights_in_t5
+
+import logging
+logging.basicConfig(level=logging.INFO)
+
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, t5_config_file, pytorch_dump_path):
+    """Build a model from a JSON config, load the TF checkpoint into it, save its state dict."""
+    t5_config = T5Config.from_json_file(t5_config_file)
+    print("Building PyTorch model from configuration: {}".format(str(t5_config)))
+    # NOTE(review): modeling_t5.py in this patch defines no T5ForPreTraining - confirm the class.
+    pytorch_model = T5ForPreTraining(t5_config)
+
+    # Copy the TensorFlow checkpoint variables onto the freshly built model.
+    load_tf_weights_in_t5(pytorch_model, t5_config, tf_checkpoint_path)
+
+    print("Save PyTorch model to {}".format(pytorch_dump_path))
+    torch.save(pytorch_model.state_dict(), pytorch_dump_path)
+
+
+if __name__ == "__main__":
+    arg_parser = argparse.ArgumentParser()
+    # All three options are mandatory.
+    arg_parser.add_argument("--tf_checkpoint_path",
+                            default=None,
+                            type=str,
+                            required=True,
+                            help="Path to the TensorFlow checkpoint path.")
+    arg_parser.add_argument("--t5_config_file",
+                            default=None,
+                            type=str,
+                            required=True,
+                            help="The config json file corresponding to the pre-trained T5 model. \n"
+                                 "This specifies the model architecture.")
+    arg_parser.add_argument("--pytorch_dump_path",
+                            default=None,
+                            type=str,
+                            required=True,
+                            help="Path to the output PyTorch model.")
+    cli_args = arg_parser.parse_args()
+    convert_tf_checkpoint_to_pytorch(cli_args.tf_checkpoint_path,
+                                     cli_args.t5_config_file,
+                                     cli_args.pytorch_dump_path)
diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
new file mode 100644
index 0000000000..fa3c22f24b
--- /dev/null
+++ b/transformers/modeling_t5.py
@@ -0,0 +1,373 @@
+# coding=utf-8
+# Copyright 2018 T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" PyTorch T5 model. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import math
+import os
+import sys
+from io import open
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from .modeling_utils import PreTrainedModel, prune_linear_layer
+from .configuration_t5 import T5Config
+from .file_utils import add_start_docstrings
+
+logger = logging.getLogger(__name__)
+
+####################################################
+# This dict contains shortcut names and associated url
+# for the pretrained weights provided with the models
+####################################################
+T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-pytorch_model.bin",
+    't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-pytorch_model.bin",
+}
+
+####################################################
+# This is a conversion method from TF 1.0 to PyTorch
+# More details: https://medium.com/huggingface/from-tensorflow-to-pytorch-265f40ef2a28
+####################################################
+def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
+    """ Load the variables of a TensorFlow checkpoint into the same-named attributes of `model`
+        (optimizer slots are skipped, `kernel` arrays are transposed); returns `model`. `config` is unused. """
+    try:
+        import re
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error("Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions.")
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info("Converting TensorFlow checkpoint from {}".format(tf_path))
+    # Read every variable of the checkpoint into memory as (name, array) pairs
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info("Loading TF weight {} with shape {}".format(name, shape))
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        name = name.split('/')
+        # adam_v, adam_m and global_step are optimizer/training state stored in the checkpoint;
+        # they are not required for using the pretrained model, so such variables are skipped
+        if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
+            logger.info("Skipping {}".format("/".join(name)))
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
+                l = re.split(r'_(\d+)', m_name)  # e.g. "layer_3" -> ["layer", "3", ""]
+            else:
+                l = [m_name]
+            if l[0] == 'kernel' or l[0] == 'gamma':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'output_bias' or l[0] == 'beta':
+                pointer = getattr(pointer, 'bias')
+            elif l[0] == 'output_weights':
+                pointer = getattr(pointer, 'weight')
+            elif l[0] == 'squad':
+                pointer = getattr(pointer, 'classifier')
+            else:
+                try:
+                    pointer = getattr(pointer, l[0])
+                except AttributeError:
+                    logger.info("Skipping {}".format("/".join(name)))
+                    continue  # NOTE(review): skips only this path component, not the whole variable - confirm intent
+            if len(l) >= 2:
+                num = int(l[1])
+                pointer = pointer[num]
+        if m_name[-11:] == '_embeddings':  # m_name is the last path component here
+            pointer = getattr(pointer, 'weight')
+        elif m_name == 'kernel':
+            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)  # attach both shapes to the error
+            raise
+        logger.info("Initialize PyTorch weight {}".format(name))
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+####################################################
+# PyTorch Models are constructed by sub-classing
+# - torch.nn.Module for the layers and
+# - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module)
+####################################################
+
+class T5Layer(nn.Module):
+    """ Attention sub-module followed by an intermediate and an output sub-module. """
+    def __init__(self, config):
+        super(T5Layer, self).__init__()
+        self.attention = T5Attention(config)
+        self.intermediate = T5Intermediate(config)
+        self.output = T5Output(config)
+
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
+        attn_results = self.attention(hidden_states, attention_mask, head_mask)
+        attn_hidden = attn_results[0]
+        block_output = self.output(self.intermediate(attn_hidden), attn_hidden)
+        # Whatever the attention module returned after its first element is passed through.
+        return (block_output,) + attn_results[1:]
+
+
+
+class T5PreTrainedModel(PreTrainedModel):
+    """ Common base of the T5 models: declares the config class, the pretrained archive map and
+        the TF checkpoint loader, and provides the weight initialization.
+    """
+    config_class = T5Config
+    pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_t5
+    base_model_prefix = "transformer"
+
+    def _init_weights(self, module):
+        """ Initialize the weights of a single sub-module """
+        is_linear = isinstance(module, nn.Linear)
+        if is_linear or isinstance(module, nn.Embedding):
+            # Plain normal init (TF uses truncated_normal), cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if is_linear and module.bias is not None:
+            module.bias.data.zero_()
+
+
+T5_START_DOCSTRING = r"""    The T5 model was proposed in
+    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
+    by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
+    It's an encoder decoder pre-trained transformer.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
+        https://arxiv.org/abs/1910.10683
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+T5_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+
+            (b) For single sequences:
+
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+
+            T5 is a model with relative position embeddings so you should be able to pad the inputs on
+            the right or the left.
+
+            Indices can be obtained using :class:`transformers.T5Tokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare single stack (encoder or decoder) of a T5 Model transformer outputting raw hidden-states "
+                      "without any specific head on top.",
+                      T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+class T5Model(T5PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased')
+        model = T5Model.from_pretrained('t5-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    def __init__(self, config):
+        super(T5Model, self).__init__(config)
+
+        self.embeddings = T5Embeddings(config)
+        self.encoder = T5Encoder(config)
+        self.pooler = T5Pooler(config)  # NOTE(review): not used by forward() below
+
+        self.init_weights()
+
+    def get_input_embeddings(self):
+        """ Return the word embedding module (a plain method, like `set_input_embeddings`). """
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, new_embeddings):
+        self.embeddings.word_embeddings = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """ Prunes heads of the model.
+            heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+            See base class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
+        if attention_mask is None:
+            attention_mask = torch.ones_like(input_ids)
+        if token_type_ids is None:
+            token_type_ids = torch.zeros_like(input_ids)
+
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to float if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.config.num_hidden_layers
+
+        # Embed the tokens, then run the stack with the additive attention mask and
+        # the per-layer head mask prepared above.
+        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
+        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
+        sequence_output = encoder_outputs[0]
+        outputs = (sequence_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+
+        return outputs  # sequence_output, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
+    T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+class T5WithLMHead(T5PreTrainedModel):
+    r"""
+        **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the language modeling loss (shifted inside the model).
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Language modeling loss (logits at position ``i`` are scored against the label at ``i + 1``).
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased')
+        model = T5WithLMHead.from_pretrained('t5-base-uncased')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, lm_labels=input_ids)
+        loss, prediction_scores = outputs[:2]
+
+    """
+    def __init__(self, config):
+        super(T5WithLMHead, self).__init__(config)
+
+        self.transformer = T5Model(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
+
+        self.init_weights()
+
+    def get_output_embeddings(self):
+        return self.lm_head
+
+    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
+                lm_labels=None):
+        """ Run the transformer, project to vocabulary logits and, given labels, prepend the LM loss. """
+        outputs = self.transformer(input_ids,
+                            attention_mask=attention_mask,
+                            token_type_ids=token_type_ids,
+                            position_ids=position_ids,
+                            head_mask=head_mask)
+
+        sequence_output = outputs[0]
+        lm_logits = self.lm_head(sequence_output)
+
+        outputs = (lm_logits,) + outputs[1:]  # T5Model returns no pooled output: extras start at index 1
+        if lm_labels is not None:
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = lm_labels[..., 1:].contiguous()
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
+                            shift_labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (lm_loss), lm_logits, (hidden_states), (attentions)
diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py
new file mode 100644
index 0000000000..deb453846c
--- /dev/null
+++ b/transformers/modeling_tf_t5.py
@@ -0,0 +1,496 @@
+# coding=utf-8
+# Copyright 2018 T5 Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" TF 2.0 T5 model. """
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import json
+import logging
+import math
+import os
+import sys
+from io import open
+
+import numpy as np
+import tensorflow as tf
+
+from .configuration_t5 import T5Config
+from .modeling_tf_utils import TFPreTrainedModel, get_initializer
+from .file_utils import add_start_docstrings
+
+logger = logging.getLogger(__name__)
+
+####################################################
+# This dict contains the shortcut names and associated URLs
+# of the pretrained weights provided with the models
+####################################################
+TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-tf_model.h5",
+    't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-tf_model.h5",
+}
+
+####################################################
+# TF 2.0 Models are constructed using Keras imperative API by sub-classing
+# - tf.keras.layers.Layer for the layers and
+# - TFPreTrainedModel for the models (itself a sub-class of tf.keras.Model)
+####################################################
+
+####################################################
+# Here is an example of typical layer in a TF 2.0 model of the library
+# The classes are usually identical to the PyTorch ones and prefixed with 'TF'.
+#
+# Note that class __init__ parameters include **kwargs (sent to 'super').
+# This lets us control the class scope and variable names:
+# More precisely, we set the names of the class attributes (lower level layers)
+# to the equivalent attribute names in the PyTorch model so we can have equivalent
+# class and scope structure between PyTorch and TF 2.0 models and easily load one in the other.
+#
+# See the conversion methods in modeling_tf_pytorch_utils.py for more details
+####################################################
+class TFT5Layer(tf.keras.layers.Layer):
+    """Chains the attention, intermediate and output sub-layers built from `config`."""
+    def __init__(self, config, **kwargs):
+        super(TFT5Layer, self).__init__(**kwargs)
+        self.attention = TFT5Attention(config, name='attention')
+        self.intermediate = TFT5Intermediate(config, name='intermediate')
+        self.transformer_output = TFT5Output(config, name='output')
+
+    def call(self, inputs, training=False):
+        """Take ``[hidden_states, attention_mask, head_mask]``; return the layer output plus attention extras."""
+        hidden_states, attention_mask, head_mask = inputs
+        attn_results = self.attention([hidden_states, attention_mask, head_mask], training=training)
+        attended = attn_results[0]
+        expanded = self.intermediate(attended)
+        block_output = self.transformer_output([expanded, attended], training=training)
+        return (block_output,) + attn_results[1:]  # add attentions if we output them
+
+
+####################################################
+# The full model without a specific pretrained or finetuning head is
+# provided as a tf.keras.layers.Layer usually called "TFT5MainLayer"
+####################################################
+class TFT5MainLayer(tf.keras.layers.Layer):
+    """Model body without a head: normalises the call inputs, builds the additive mask, runs embeddings + encoder."""
+    def __init__(self, config, **kwargs):
+        super(TFT5MainLayer, self).__init__(**kwargs)
+        # NOTE(review): call() reads self.num_hidden_layers, self.embeddings and self.encoder; none is set here yet.
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        raise NotImplementedError  # Not implemented yet in the library for TF 2.0 models
+
+    def _prune_heads(self, heads_to_prune):
+        raise NotImplementedError  # Not implemented yet in the library for TF 2.0 models
+
+    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
+        """Return ``(sequence_output, ...)``: the encoder's first output followed by its remaining outputs."""
+        # We allow three types of multi-inputs:
+        # - traditional keyword arguments in the call method
+        # - all the arguments provided as a dict in the first positional argument of call
+        # - all the arguments provided as a list/tuple (ordered) in the first positional argument of call
+        # The last two options are useful to use the tf.keras fit() method.
+        if isinstance(inputs, (tuple, list)):
+            input_ids = inputs[0]
+            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
+            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
+            position_ids = inputs[3] if len(inputs) > 3 else position_ids
+            head_mask = inputs[4] if len(inputs) > 4 else head_mask
+            assert len(inputs) <= 5, "Too many inputs."
+        elif isinstance(inputs, dict):
+            input_ids = inputs.get('input_ids')
+            attention_mask = inputs.get('attention_mask', attention_mask)
+            token_type_ids = inputs.get('token_type_ids', token_type_ids)
+            position_ids = inputs.get('position_ids', position_ids)
+            head_mask = inputs.get('head_mask', head_mask)
+            assert len(inputs) <= 5, "Too many inputs."
+        else:
+            input_ids = inputs
+
+        if attention_mask is None:
+            attention_mask = tf.fill(tf.shape(input_ids), 1)
+        if token_type_ids is None:
+            token_type_ids = tf.fill(tf.shape(input_ids), 0)
+
+        # We create a 3D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # Head masking is not supported yet: a user-provided mask raises, otherwise each layer receives None.
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.num_hidden_layers
+
+        ##################################
+        # Replace this with your model code
+        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
+        encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
+        sequence_output = encoder_outputs[0]
+        outputs = (sequence_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+
+        return outputs  # sequence_output, (hidden_states), (attentions)
+
+
+####################################################
+# TFT5PreTrainedModel is a sub-class of tf.keras.Model
+# which takes care of loading and saving pretrained weights
+# and various common utilities.
+# Here you just need to specify a few (self-explanatory)
+# pointers for your model.
+####################################################
+class TFT5PreTrainedModel(TFPreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = T5Config  # configuration class paired with these models
+    pretrained_model_archive_map = TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP  # shortcut name -> weights URL
+    base_model_prefix = "transformer"  # same name as the `transformer` attribute the model classes in this file assign
+
+
+T5_START_DOCSTRING = r"""    The T5 model was proposed in
+    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
+    by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li
+    and Peter J. Liu. It's an encoder-decoder transformer pre-trained in a text-to-text denoising
+    generative setting.
+
+    This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
+    refer to the TF 2.0 documentation for all matter related to general usage and behavior.
+
+    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
+        https://arxiv.org/abs/1910.10683
+
+    .. _`tf.keras.Model`:
+        https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
+
+    Note on the model inputs:
+        TF 2.0 models accept two formats as inputs:
+
+            - having all inputs as keyword arguments (like PyTorch models), or
+            - having all inputs as a list, tuple or dict in the first positional arguments.
+
+        This second option is useful when using `tf.keras.Model.fit()` method which currently requires having all the tensors in the first argument of the model call function: `model(inputs)`.
+
+        If you choose this second option, there are three possibilities you can use to gather all the input Tensors in the first positional argument :
+
+        - a single Tensor with input_ids only and nothing else: `model(input_ids)`
+        - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+            `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+        - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+            `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
+
+    Parameters:
+        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+# NOTE(review): the input-format text below ([CLS]/[SEP], token types, absolute positions) comes from the BERT-style template; verify it for T5.
+XXX_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, XXX input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
+
+                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
+
+            (b) For single sequences:
+
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+
+                ``token_type_ids:   0   0   0   0  0     0   0``
+
+            Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
+            Indices can be obtained using :class:`transformers.T5Tokenizer`.
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Segment token indices to indicate first and second portions of the inputs.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding <https://arxiv.org/abs/1810.04805>`_ for more details).
+        **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states without any specific head on top.",
+                      T5_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class TFXxxModel(TFT5PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import T5Tokenizer, TFXxxModel
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased')
+        model = TFXxxModel.from_pretrained('t5-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    # NOTE(review): this class still carries the template name `TFXxxModel`; rename it to `TFT5Model`
+    # together with its importers. The base class, the docstring prefix and the main layer use the
+    # T5 names defined in this module, because `XXX_START_DOCSTRING`, `TFXxxPreTrainedModel` and
+    # `TFXxxMainLayer` are not defined anywhere in it.
+    # No `pooler_output` is documented: TFT5MainLayer.call returns
+    # ``sequence_output, (hidden_states), (attentions)`` and has no pooling step.
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFXxxModel, self).__init__(config, *inputs, **kwargs)
+        self.transformer = TFT5MainLayer(config, name='transformer')
+
+    def call(self, inputs, **kwargs):
+        """Forward `inputs` and the keyword arguments to the main layer and return its output tuple unchanged."""
+        outputs = self.transformer(inputs, **kwargs)
+        return outputs
+
+
+@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
+    T5_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class TFXxxForMaskedLM(TFT5PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import T5Tokenizer, TFXxxForMaskedLM
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased')
+        model = TFXxxForMaskedLM.from_pretrained('t5-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        prediction_scores = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFXxxForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        # NOTE(review): `TFXxxMLMHead` is not defined in this module and TFT5MainLayer sets no `embeddings` attribute yet.
+        self.transformer = TFT5MainLayer(config, name='transformer')
+        self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name='mlm')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.transformer(inputs, **kwargs)
+
+        sequence_output = outputs[0]
+        prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
+        # The main layer returns (sequence_output, (hidden_states), (attentions)) with no pooled output at index 1.
+        outputs = (prediction_scores,) + outputs[1:]  # Add hidden states and attention if they are here
+
+        return outputs  # prediction_scores, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""T5 Model transformer with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks. """,
+    T5_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class TFXxxForSequenceClassification(TFT5PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import T5Tokenizer, TFXxxForSequenceClassification
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased')
+        model = TFXxxForSequenceClassification.from_pretrained('t5-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        logits = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFXxxForSequenceClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.transformer = TFT5MainLayer(config, name='transformer')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.transformer(inputs, **kwargs)
+        # NOTE(review): TFT5MainLayer.call returns no pooled output, so outputs[1] is not a pooled vector; add a pooling step.
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
+        logits = self.classifier(pooled_output)
+
+        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
+
+        return outputs  # logits, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""T5 Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    T5_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class TFXxxForTokenClassification(TFT5PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import T5Tokenizer, TFXxxForTokenClassification
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased')
+        model = TFXxxForTokenClassification.from_pretrained('t5-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        scores = outputs[0]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFXxxForTokenClassification, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.transformer = TFT5MainLayer(config, name='transformer')
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+        self.classifier = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='classifier')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.transformer(inputs, **kwargs)
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
+        logits = self.classifier(sequence_output)
+        # The main layer returns (sequence_output, (hidden_states), (attentions)) with no pooled output at index 1.
+        outputs = (logits,) + outputs[1:]  # add hidden states and attention if they are here
+
+        return outputs  # scores, (hidden_states), (attentions)
+
+
+@add_start_docstrings("""T5 Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layer on top of
+    the hidden-states output to compute `span start logits` and `span end logits`). """,
+    T5_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
+class TFXxxForQuestionAnswering(TFT5PreTrainedModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **start_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
+            Span-start scores (before SoftMax).
+        **end_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
+            Span-end scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import T5Tokenizer, TFXxxForQuestionAnswering
+
+        tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased')
+        model = TFXxxForQuestionAnswering.from_pretrained('t5-base-uncased')
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+        start_scores, end_scores = outputs[:2]
+
+    """
+    def __init__(self, config, *inputs, **kwargs):
+        super(TFXxxForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.transformer = TFT5MainLayer(config, name='transformer')
+        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
+                                                kernel_initializer=get_initializer(config.initializer_range),
+                                                name='qa_outputs')
+
+    def call(self, inputs, **kwargs):
+        outputs = self.transformer(inputs, **kwargs)
+
+        sequence_output = outputs[0]
+        # qa_outputs emits config.num_labels channels; the split below halves the last axis into start / end logits.
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = tf.split(logits, 2, axis=-1)
+        start_logits = tf.squeeze(start_logits, axis=-1)
+        end_logits = tf.squeeze(end_logits, axis=-1)
+        # The main layer returns (sequence_output, (hidden_states), (attentions)) with no pooled output at index 1.
+        outputs = (start_logits, end_logits,) + outputs[1:]
+
+        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py
new file mode 100644
index 0000000000..3f8f4bf556
--- /dev/null
+++ b/transformers/tokenization_t5.py
@@ -0,0 +1,214 @@
+# coding=utf-8
+# Copyright 2018 T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Tokenization class for model T5."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import collections
+import logging
+import os
+import unicodedata
+from io import open
+
+from .tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+####################################################
+# Mapping from the keyword argument names of the Tokenizer `__init__`
+# to the file names used when serializing Tokenizer instances (see `save_vocabulary`)
+####################################################
+VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+
+####################################################
+# Mapping from the keyword argument names of the Tokenizer `__init__`
+# to the pretrained vocabulary URL of every model shortcut name.
+####################################################
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-vocab.txt",
+        't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-vocab.txt",
+    }
+}
+
+####################################################
+# Mapping from model shortcut names to the max length of inputs (exposed as `max_model_input_sizes`)
+####################################################
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    't5-base-uncased': 512,
+    't5-large-uncased': 512,
+}
+
+####################################################
+# Mapping from model shortcut names to a dictionary of additional
+# keyword arguments for the Tokenizer `__init__`.
+# To be used for checkpoint-specific configurations.
+####################################################
+PRETRAINED_INIT_CONFIGURATION = {
+    't5-base-uncased': {'do_lower_case': True},
+    't5-large-uncased': {'do_lower_case': True},
+}
+
+
+def load_vocab(vocab_file):
+    """Read a one-token-per-line file and map each token to its 0-based line index."""
+    with open(vocab_file, "r", encoding="utf-8") as handle:
+        lines = handle.readlines()
+    token_to_index = collections.OrderedDict()
+    for line_number, raw_line in enumerate(lines):
+        # Only the trailing newline is stripped; any other whitespace stays part of the token.
+        token_to_index[raw_line.rstrip('\n')] = line_number
+    return token_to_index
+
+
+class T5Tokenizer(PreTrainedTokenizer):
+    r"""
+    Constructs a T5Tokenizer.
+    :class:`~transformers.T5Tokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
+
+    Args:
+        vocab_file: Path to a one-wordpiece-per-line vocabulary file
+        do_lower_case: Whether to lower case the input (accepted, but not yet applied by this class)
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, do_lower_case=True,
+                 unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
+                 mask_token="[MASK]", **kwargs):
+        """Constructs a T5Tokenizer.
+
+        Args:
+            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
+            **do_lower_case**: (`optional`) boolean (default True)
+                Whether to lower case the input (accepted, but not yet applied by this class)
+        """
+        super(T5Tokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
+                                           pad_token=pad_token, cls_token=cls_token,
+                                           mask_token=mask_token, **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict((i, tok) for tok, i in self.vocab.items())  # for _convert_id_to_token
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def _tokenize(self, text):
+        """ Take as input a string and return a list of strings (tokens) for words/sub-words.
+        NOTE(review): reads do_basic_tokenize / basic_tokenizer / wordpiece_tokenizer, which __init__ never sets. """
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
+                for sub_token in self.wordpiece_tokenizer.tokenize(token):
+                    split_tokens.append(sub_token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
+        return split_tokens
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        out_string = ' '.join(tokens).replace(' ##', '').strip()
+        return out_string
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A BERT sequence has the following format:
+            single sequence: [CLS] X [SEP]
+            pair of sequences: [CLS] A [SEP] B [SEP]
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formatted with
+                special tokens for the model
+
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                                 "ids is already formated with special tokens for the model.")
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A BERT sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    def save_vocabulary(self, vocab_path):
+        """Save the tokenizer vocabulary to a directory or file."""
+        index = 0
+        if os.path.isdir(vocab_path):
+            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
+        else:
+            vocab_file = vocab_path
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
+                                   " Please check that the vocabulary is not corrupted!".format(vocab_file))
+                    index = token_index
+                writer.write(token + u'\n')
+                index += 1
+        return (vocab_file,)

From 568c0ffb7ef73555567f8bd467cf80c2b1e6ac13 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 5 Nov 2019 16:40:29 +0100
Subject: [PATCH 002/110] adding T5 model

---
 transformers/modeling_encoder_decoder.py |   4 +-
 transformers/modeling_t5.py              | 471 ++++++++++++++++++++---
 2 files changed, 412 insertions(+), 63 deletions(-)

diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py
index a884abd0a2..713cf5252e 100644
--- a/transformers/modeling_encoder_decoder.py
+++ b/transformers/modeling_encoder_decoder.py
@@ -217,9 +217,7 @@ class PreTrainedEncoderDecoder(nn.Module):
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
         if encoder_hidden_states is None:
             encoder_outputs = self.encoder(encoder_input_ids, **kwargs_encoder)
-            encoder_hidden_states = encoder_outputs[
-                0
-            ]  # output the last layer hidden state
+            encoder_hidden_states = encoder_outputs[0]
         else:
             encoder_outputs = ()
 
diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index fa3c22f24b..d93e96211d 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -1,5 +1,5 @@
 # coding=utf-8
-# Copyright 2018 T5 Authors and HuggingFace Inc. team.
+# Copyright 2018 Mesh TensorFlow authors, T5 Authors and HuggingFace Inc. team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -20,11 +20,14 @@ import json
 import logging
 import math
 import os
+import copy
 import sys
+import itertools
 from io import open
 
 import torch
 from torch import nn
+import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
 from .modeling_utils import PreTrainedModel, prune_linear_layer
@@ -119,31 +122,389 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
 # - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module)
 ####################################################
 
-class T5Layer(nn.Module):
+class T5DenseReluDense(nn.Module):  # feed-forward core: Linear(d_model -> d_ff), ReLU, dropout, Linear(d_ff -> d_model)
     def __init__(self, config):
-        super(T5Layer, self).__init__()
-        self.attention = T5Attention(config)
-        self.intermediate = T5Intermediate(config)
-        self.output = T5Output(config)
+        super(T5DenseReluDense, self).__init__()
+        self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)  # input projection, no bias term
+        self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)  # output projection, no bias term
+        self.dropout = nn.Dropout(config.dropout)  # NOTE(review): T5Attention reads `config.dropout_rate` -- confirm the field name
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None):
-        attention_outputs = self.attention(hidden_states, attention_mask, head_mask)
-        attention_output = attention_outputs[0]
-        intermediate_output = self.intermediate(attention_output)
-        layer_output = self.output(intermediate_output, attention_output)
-        outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
+    def forward(self, hidden_states):
+        h = self.wi(hidden_states)
+        h = F.relu(h)
+        h = self.dropout(h)  # dropout sits between the activation and the output projection
+        h = self.wo(h)
+        return h
+
+
+class T5LayerFF(nn.Module):
+    """ Pre-norm feed-forward sub-layer: x + dropout(DenseReluDense(layer_norm(x))). """
+    def __init__(self, config):
+        super(T5LayerFF, self).__init__()
+        self.DenseReluDense = T5DenseReluDense(config)
+        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)  # 1st arg is the normalized shape, not eps
+        self.dropout = nn.Dropout(config.dropout)
+
+    def forward(self, hidden_states):
+        norm_x = self.layer_norm(hidden_states)
+        y = self.DenseReluDense(norm_x)
+        return hidden_states + self.dropout(y)
+
+
+class T5Attention(nn.Module):
+    """ Multi-head attention with a learned, bucketed relative-position bias added to the (unscaled) scores. """
+    NEW_ID = itertools.count()  # gives every instance a distinct `layer_id`, used as its key in the `cache` dict
+
+    def __init__(self, config):
+        super(T5Attention, self).__init__()
+        self.layer_id = next(T5Attention.NEW_ID)
+        self.is_decoder = config.is_decoder  # read by `compute_bias` to choose uni- vs bidirectional buckets
+        self.output_attentions = config.output_attentions
+        self.relative_attention_num_buckets = config.relative_attention_num_buckets
+        self.dim = config.d_model
+        self.n_heads = config.num_heads
+        self.dropout = config.dropout_rate
+        assert self.dim % self.n_heads == 0
+
+        self.q = nn.Linear(self.dim, self.dim, bias=False)
+        self.k = nn.Linear(self.dim, self.dim, bias=False)
+        self.v = nn.Linear(self.dim, self.dim, bias=False)
+        self.o = nn.Linear(self.dim, self.dim, bias=False)
+
+        self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        attention_head_size = self.dim // self.n_heads
+        if len(heads) == 0:
+            return
+        mask = torch.ones(self.n_heads, attention_head_size)
+        heads = set(heads) - self.pruned_heads
+        for head in heads:
+            head -= sum(1 if h < head else 0 for h in self.pruned_heads)
+            mask[head] = 0
+        mask = mask.view(-1).contiguous().eq(1)
+        index = torch.arange(len(mask))[mask].long()
+        # Prune linear layers
+        self.q = prune_linear_layer(self.q, index)
+        self.k = prune_linear_layer(self.k, index)
+        self.v = prune_linear_layer(self.v, index)
+        self.o = prune_linear_layer(self.o, index, dim=1)
+        # Update hyper params
+        self.n_heads = self.n_heads - len(heads)  # NOTE(review): relative_attention_bias keeps its original head count
+        self.dim = attention_head_size * self.n_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @staticmethod
+    def _relative_position_bucket(relative_position,
+                                  bidirectional=True,
+                                  num_buckets=32,
+                                  max_distance=128):
+        """
+        Adapted from Mesh Tensorflow:
+        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+
+        Translate relative position to a bucket number for relative attention.
+        The relative position is defined as memory_position - query_position, i.e.
+        the distance in tokens from the attending position to the attended-to
+        position.  If bidirectional=False, then positive relative positions are
+        invalid.
+        We use smaller buckets for small absolute relative_position and larger buckets
+        for larger absolute relative_positions.  All relative positions >=max_distance
+        map to the same bucket.  All relative positions <=-max_distance map to the
+        same bucket.  This should allow for more graceful generalization to longer
+        sequences than the model has been trained on.
+        Args:
+            relative_position: an integer Tensor
+            bidirectional: a boolean - whether the attention is bidirectional
+            num_buckets: an integer
+            max_distance: an integer
+        Returns:
+            a Tensor with the same shape as relative_position, containing torch.long
+            values in the range [0, num_buckets)
+        """
+        ret = 0
+        n = -relative_position
+        if bidirectional:
+            num_buckets //= 2
+            ret += (n < 0).to(torch.long) * num_buckets  # mtf.to_int32(mtf.less(n, 0)) * num_buckets
+            n = torch.abs(n)
+        else:
+            n = torch.max(n, torch.zeros_like(n))  # elementwise clamp; torch.max(n, 0) would reduce along dim 0
+        # now n is in the range [0, inf)
+
+        # half of the buckets are for exact increments in positions
+        max_exact = num_buckets // 2
+        is_small = (n < max_exact)
+
+        # The other half of the buckets are for logarithmically bigger bins in positions up to max_distance
+        val_if_large = max_exact + (
+            torch.log(n.float() / max_exact)
+            / math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long)
+        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))  # elementwise cap
+
+        ret += torch.where(is_small, n, val_if_large)
+        return ret
+
+    def compute_bias(self, qlen, klen):
+        """ Compute binned relative position bias, built on the same device as the bias embedding """
+        context_position = torch.arange(qlen, dtype=torch.long, device=self.relative_attention_bias.weight.device)[:, None]
+        memory_position = torch.arange(klen, dtype=torch.long, device=self.relative_attention_bias.weight.device)[None, :]
+        relative_position = memory_position - context_position  # shape (qlen, klen)
+        rp_bucket = self._relative_position_bucket(relative_position,
+                                                   bidirectional=not self.is_decoder,
+                                                   num_buckets=self.relative_attention_num_buckets)
+        values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
+        values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen)
+        return values
+
+    def forward(self, input, mask, kv=None, position_bias=None, cache=None, head_mask=None):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
+        """
+        # Input is (bs, qlen, dim)
+        # Mask is (bs, klen) (non-causal) or (bs, klen, klen)
+        # NOTE(review): T5Stack hands over an additive 4D mask (0.0 = attend, -10000.0 = masked); that disagrees with `mask == 0` below.
+        bs, qlen, dim = input.size()
+        if kv is None:
+            klen = qlen if cache is None else cache['slen'] + qlen
+        else:
+            klen = kv.size(1)
+        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
+        n_heads = self.n_heads
+        dim_per_head = self.dim // n_heads
+        mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)
+
+        def shape(x):
+            """  projection """
+            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
+
+        def unshape(x):
+            """  compute context """
+            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
+
+        q = shape(self.q(input))                                          # (bs, n_heads, qlen, dim_per_head)
+        if kv is None:
+            k = shape(self.k(input))                                      # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(input))                                      # (bs, n_heads, qlen, dim_per_head)
+        elif cache is None or self.layer_id not in cache:
+            k = shape(self.k(kv))                                         # (bs, n_heads, klen, dim_per_head)
+            v = shape(self.v(kv))                                         # (bs, n_heads, klen, dim_per_head)
+
+        if cache is not None:
+            if self.layer_id in cache:
+                if kv is None:
+                    k_, v_ = cache[self.layer_id]
+                    k = torch.cat([k_, k], dim=2)                             # (bs, n_heads, klen, dim_per_head)
+                    v = torch.cat([v_, v], dim=2)                             # (bs, n_heads, klen, dim_per_head)
+                else:
+                    k, v = cache[self.layer_id]
+            cache[self.layer_id] = (k, v)
+
+        # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
+        scores = torch.matmul(q, k.transpose(2, 3))                           # (bs, n_heads, qlen, klen)
+
+        if position_bias is None:
+            position_bias = self.compute_bias(qlen, klen)
+        scores += position_bias
+
+        mask = (mask == 0).view(mask_reshape).expand_as(scores)               # (bs, n_heads, qlen, klen)
+        scores.masked_fill_(mask, -float('inf'))                              # (bs, n_heads, qlen, klen)
+
+        weights = F.softmax(scores.float(), dim=-1).type_as(scores)           # (bs, n_heads, qlen, klen)
+        weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = torch.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
+
+        context = self.o(unshape(context))                                    # (bs, qlen, dim)
+
+        outputs = (context,)
+        if self.output_attentions:
+            outputs = outputs + (weights,)
         return outputs
 
 
+class T5LayerSelfAttention(nn.Module):
+    """ Pre-norm self-attention sub-layer: x + dropout(SelfAttention(layer_norm(x))). """
+    def __init__(self, config):
+        super(T5LayerSelfAttention, self).__init__()
+        self.SelfAttention = T5Attention(config)
+        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)  # 1st arg is the normalized shape, not eps
+        self.dropout = nn.Dropout(config.dropout)
 
-class T5PreTrainedModel(PreTrainedModel):
+    def forward(self, hidden_states, attention_mask=None, head_mask=None):
+        norm_x = self.layer_norm(hidden_states)
+        attention_output = self.SelfAttention(norm_x,
+                                              mask=attention_mask,  # T5Attention.forward names this parameter `mask`
+                                              head_mask=head_mask)
+        layer_output = hidden_states + self.dropout(attention_output[0])
+        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class T5LayerCrossAttention(nn.Module):
+    """ Pre-norm cross-attention sub-layer: x + dropout(EncDecAttention(layer_norm(x), kv)). """
+    def __init__(self, config):
+        super(T5LayerCrossAttention, self).__init__()
+        self.EncDecAttention = T5Attention(config)
+        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)  # 1st arg is the normalized shape, not eps
+        self.dropout = nn.Dropout(config.dropout)
+
+    def forward(self, hidden_states, kv, attention_mask=None, head_mask=None):
+        norm_x = self.layer_norm(hidden_states)
+        attention_output = self.EncDecAttention(norm_x,
+                                                kv=kv,
+                                                mask=attention_mask,  # T5Attention.forward names this parameter `mask`
+                                                head_mask=head_mask)
+        layer_output = hidden_states + self.dropout(attention_output[0])
+        outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
+        return outputs
+
+
+class T5Block(nn.Module):
+    """ One transformer block: self-attention, then (decoders only) cross-attention, then feed-forward. """
+    def __init__(self, config):
+        super(T5Block, self).__init__()
+        self.is_decoder = config.is_decoder
+        self.layer_000 = T5LayerSelfAttention(config)
+        if not self.is_decoder:
+            self.layer_001 = T5LayerFF(config)
+        else:
+            self.layer_001 = T5LayerCrossAttention(config)
+            self.layer_002 = T5LayerFF(config)
+
+    def forward(self, hidden_states, attention_mask=None,
+                encoder_hidden_states=None, encoder_attention_mask=None, head_mask=None):
+        self_attn = self.layer_000(hidden_states,
+                                   attention_mask=attention_mask,
+                                   head_mask=head_mask)
+        hidden_states, extras = self_attn[0], self_attn[1:]
+
+        if not self.is_decoder:
+            # Encoder block: the feed-forward sub-layer directly follows self-attention.
+            hidden_states = self.layer_001(hidden_states)
+            return (hidden_states,) + extras
+
+        cross_attn = self.layer_001(hidden_states,
+                                    kv=encoder_hidden_states,
+                                    attention_mask=encoder_attention_mask,
+                                    head_mask=head_mask)
+        # Extra outputs of the cross-attention are placed ahead of the self-attention ones.
+        extras = cross_attn[1:] + extras
+        hidden_states = self.layer_002(cross_attn[0])
+        return (hidden_states,) + extras  # add attentions if we output them
+
+
+class T5Stack(nn.Module):
+    """ Stack of `config.num_layers` T5Block modules followed by a final layer norm and dropout. """
+    def __init__(self, config):
+        super(T5Stack, self).__init__()
+        self.is_decoder = config.is_decoder  # decides whether `forward` adds a causal mask
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.blocks = nn.ModuleList([T5Block(config) for _ in range(config.num_layers)])
+        self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)  # 1st arg is the shape, not eps
+        self.dropout = nn.Dropout(config.dropout)
+
+    def forward(self,
+                hidden_states,
+                attention_mask=None,
+                encoder_hidden_states=None,
+                encoder_attention_mask=None,
+                head_mask=None):
+        """ Run every block over `hidden_states` (batch, seq_length, d_model); input masks use 1 = attend, 0 = masked. """
+        batch_size, seq_length = hidden_states.shape[:2]
+        param_dtype = next(self.parameters()).dtype  # fp16 compatibility
+        if attention_mask is None:
+            attention_mask = torch.ones(batch_size, seq_length, device=hidden_states.device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        if attention_mask.dim() == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+
+        # Provided a padding mask of dimensions [batch_size, seq_length]
+        # - if the model is a decoder, apply a causal mask in addition to the padding mask
+        # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if attention_mask.dim() == 2:
+            if self.is_decoder:
+                seq_ids = torch.arange(seq_length, device=hidden_states.device)
+                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
+                extended_attention_mask = causal_mask[:, None, :, :].to(attention_mask.dtype) * attention_mask[:, None, None, :]
+            else:
+                extended_attention_mask = attention_mask[:, None, None, :]
+
+        # attention_mask is 1.0 where we attend and 0.0 where masked: turn it into an additive
+        # bias (0.0 / -10000.0) meant to be added to the raw scores before the softmax.
+        extended_attention_mask = extended_attention_mask.to(dtype=param_dtype)
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        # Cross-attention mask (2D or 3D): default to "attend everywhere" when encoder states come without one.
+        if encoder_attention_mask is None and encoder_hidden_states is not None:
+            encoder_attention_mask = torch.ones(encoder_hidden_states.shape[:2], device=hidden_states.device)
+        encoder_extended_attention_mask = None  # stays None when there is nothing to cross-attend to
+        if encoder_attention_mask is not None:
+            if encoder_attention_mask.dim() == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            if encoder_attention_mask.dim() == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+            encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=param_dtype)
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
+
+        # Prepare head mask if needed (1.0 in head_mask indicates we keep the head).
+        # Input shape [num_heads] or [num_layers x num_heads] -> [num_layers x batch x num_heads x seq_length x seq_length]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
+                head_mask = head_mask.expand(len(self.blocks), -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
+            head_mask = head_mask.to(dtype=param_dtype)  # switch to float if needed + fp16 compatibility
+        else:
+            head_mask = [None] * len(self.blocks)
+
+        all_hidden_states = ()
+        all_attentions = ()
+        for i, layer_module in enumerate(self.blocks):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(hidden_states,
+                                         attention_mask=extended_attention_mask,
+                                         encoder_hidden_states=encoder_hidden_states,
+                                         encoder_attention_mask=encoder_extended_attention_mask,
+                                         head_mask=head_mask[i])
+            hidden_states = layer_outputs[0]
+
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states)  # keep the dropped-out tensor: it is what gets returned
+
+        # Add last layer
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (all_attentions,)
+        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
+
+
+class T5PreTrainedModel(PreTrainedEncoderDecoder):
     """ An abstract class to handle weights initialization and
         a simple interface for dowloading and loading pretrained models.
     """
     config_class = T5Config
     pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
     load_tf_weights = load_tf_weights_in_t5
-    base_model_prefix = "transformer"
 
     def _init_weights(self, module):
         """ Initialize the weights """
@@ -238,19 +599,23 @@ class T5Model(T5PreTrainedModel):
     """
     def __init__(self, config):
         super(T5Model, self).__init__(config)
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)  # token embedding table returned by get_input_embeddings
 
-        self.embeddings = T5Embeddings(config)
-        self.encoder = T5Encoder(config)
-        self.pooler = T5Pooler(config)
+        encoder_config = copy.deepcopy(config)  # separate copies, so the decoder flag set below does not reach the encoder
+        self.encoder = T5Stack(encoder_config)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        self.decoder = T5Stack(decoder_config)
 
         self.init_weights()
 
     @property
     def get_input_embeddings(self):
-        return self.embeddings.word_embeddings
+        return self.shared  # NOTE(review): with `@property` above, this is read as an attribute rather than called -- confirm intended
 
     def set_input_embeddings(self, new_embeddings):
-        self.embeddings.word_embeddings = new_embeddings
+        self.shared = new_embeddings  # replaces the module that get_input_embeddings returns
 
     def _prune_heads(self, heads_to_prune):
         """ Prunes heads of the model.
@@ -260,50 +625,36 @@ class T5Model(T5PreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None):
-        if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids)
-        if token_type_ids is None:
-            token_type_ids = torch.zeros_like(input_ids)
+    def forward(self, encoder_input_ids, decoder_input_ids, **kwargs):
+        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
+        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
+        # that apply to the model as whole.
+        # We let the specific kwargs override the common ones in case of conflict.
+        kwargs_common = dict((k, v) for k, v in kwargs.items()
+                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
 
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # this attention mask is more simple than the triangular masking of causal attention
-        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
-
-        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
-        # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
-        # Since we are adding it to the raw scores before the softmax, this is
-        # effectively the same as removing these entirely.
-        extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype) # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
-
-        # Prepare head mask if needed
-        # 1.0 in head_mask indicate we keep the head
-        # attention_probs has shape bsz x n_heads x N x N
-        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
-        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
-        if head_mask is not None:
-            if head_mask.dim() == 1:
-                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
-            elif head_mask.dim() == 2:
-                head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
-            head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        if encoder_hidden_states is None:
+            encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+            hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
+            encoder_hidden_states = encoder_outputs[0]
         else:
-            head_mask = [None] * self.config.num_hidden_layers
+            encoder_outputs = ()
 
-        ##################################
-        # Replace this with your model code
-        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-        encoder_outputs = self.encoder(embedding_output, extended_attention_mask, head_mask=head_mask)
-        sequence_output = encoder_outputs[0]
-        outputs = (sequence_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        # Decode
+        decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+        hidden_states = self.shared(decoder_inputs_ids)  # Convert inputs in embeddings
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
+        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
 
-        return outputs  # sequence_output, (hidden_states), (attentions)
+        return decoder_outputs + encoder_outputs
 
 
 @add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
@@ -342,7 +693,7 @@ class T5WithLMHead(T5PreTrainedModel):
         super(T5ForMaskedLM, self).__init__(config)
 
         self.transformer = T5Model(config)
-        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size)
 
         self.init_weights()
 

From 88e5bef58f34dca87f28ab489fdecbeaaef8b316 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 5 Nov 2019 17:02:52 +0100
Subject: [PATCH 003/110] share position biases

---
 transformers/modeling_t5.py | 65 +++++++++++++++++++++++--------------
 1 file changed, 40 insertions(+), 25 deletions(-)

diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index d93e96211d..e1a1d019ff 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -154,9 +154,10 @@ class T5LayerFF(nn.Module):
 class T5Attention(nn.Module):
     NEW_ID = itertools.count()
 
-    def __init__(self, config):
+    def __init__(self, config, has_relative_attention_bias=False):
         super(T5Attention, self).__init__()
         self.layer_id = next(T5Attention.NEW_ID)
+        self.has_relative_attention_bias = has_relative_attention_bias
 
         self.output_attentions = config.output_attentions
         self.relative_attention_num_buckets = config.relative_attention_num_buckets
@@ -170,7 +171,8 @@ class T5Attention(nn.Module):
         self.v = nn.Linear(self.dim, self.dim, bias=False)
         self.o = nn.Linear(self.dim, self.dim, bias=False)
 
-        self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
+        if self.has_relative_attention_bias:
+            self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
         self.pruned_heads = set()
 
     def prune_heads(self, heads):
@@ -304,6 +306,8 @@ class T5Attention(nn.Module):
         scores = torch.matmul(q, k.transpose(2, 3))                           # (bs, n_heads, qlen, klen)
 
         if position_bias is None:
+            if not self.has_relative_attention_bias:
+                raise ValueError("No position_bias provided and no weights to compute position_bias")
             position_bias = self.compute_bias(qlen, klen)
         scores += position_bias
 
@@ -325,20 +329,23 @@ class T5Attention(nn.Module):
         outputs = (context,)
         if self.output_attentions:
             outputs = outputs + (weights,)
+        if self.has_relative_attention_bias:
+            outputs = outputs + (position_bias,)
         return outputs
 
 
 class T5LayerSelfAttention(nn.Module):
-    def __init__(self, config):
+    def __init__(self, config, has_relative_attention_bias=False):
         super(T5LayerSelfAttention, self).__init__()
-        self.SelfAttention = T5Attention(config)
+        self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
         self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout)
 
-    def forward(self, hidden_states, attention_mask=None, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None):
         norm_x = self.layer_norm(hidden_states)
         attention_output = self.SelfAttention(norm_x,
                                               attention_mask=attention_mask,
+                                              position_bias=position_bias,
                                               head_mask=head_mask)
         y = attention_output[0]
         layer_output = hidden_states + self.dropout(y)
@@ -347,17 +354,18 @@ class T5LayerSelfAttention(nn.Module):
 
 
 class T5LayerCrossAttention(nn.Module):
-    def __init__(self, config):
+    def __init__(self, config, has_relative_attention_bias=False):
         super(T5LayerCrossAttention, self).__init__()
-        self.EncDecAttention = T5Attention(config)
+        self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
         self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout)
 
-    def forward(self, hidden_states, kv, attention_mask=None, head_mask=None):
+    def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None):
         norm_x = self.layer_norm(hidden_states)
         attention_output = self.EncDecAttention(norm_x,
                                                 kv=kv,
                                                 attention_mask=attention_mask,
+                                                position_bias=position_bias,
                                                 head_mask=head_mask)
         y = attention_output[0]
         layer_output = hidden_states + self.dropout(y)
@@ -366,20 +374,22 @@ class T5LayerCrossAttention(nn.Module):
 
 
 class T5Block(nn.Module):
-    def __init__(self, config):
+    def __init__(self, config, has_relative_attention_bias=False):
         super(T5Block, self).__init__()
         self.is_decoder = config.is_decoder
-        self.layer_000 = T5LayerSelfAttention(config)
+        self.layer_000 = T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)
         if self.is_decoder:
-            self.layer_001 = T5LayerCrossAttention(config)
+            self.layer_001 = T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias)
             self.layer_002 = T5LayerFF(config)
         else:
             self.layer_001 = T5LayerFF(config)
 
-    def forward(self, hidden_states, attention_mask=None,
-                encoder_hidden_states=None, encoder_attention_mask=None, head_mask=None):
+    def forward(self, hidden_states, attention_mask=None, position_bias=None,
+                encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
+                head_mask=None):
         self_attention_outputs = self.layer_000(hidden_states,
                                                 attention_mask=attention_mask,
+                                                position_bias=position_bias,
                                                 head_mask=head_mask)
         hidden_states = self_attention_outputs[0]
         outputs = self_attention_outputs[1:]
@@ -388,6 +398,7 @@ class T5Block(nn.Module):
             cross_attention_outputs = self.layer_001(hidden_states,
                                                      kv=encoder_hidden_states,
                                                      attention_mask=encoder_attention_mask,
+                                                     position_bias=encoder_decoder_position_bias,
                                                      head_mask=head_mask)
             hidden_states = cross_attention_outputs[0]
             outputs = cross_attention_outputs[1:] + outputs
@@ -402,7 +413,8 @@ class T5Block(nn.Module):
 class T5Stack(nn.Module):
     def __init__(self, config):
         super(T5Stack, self).__init__()
-        self.blocks = nn.ModuleList([T5Block(config) for _ in range(config.num_layers)])
+        self.blocks = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0))
+                                     for i in range(config.num_layers)])
         self.final_layer_norm = nn.LayerNorm(config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout)
 
@@ -413,8 +425,12 @@ class T5Stack(nn.Module):
                 encoder_attention_mask=None,
                 head_mask=None):
 
+        batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
+        encoder_seq_length = encoder_hidden_states.shape[1] if encoder_hidden_states is not None else 0
         if attention_mask is None:
-            attention_mask = torch.ones_like(input_ids)
+            attention_mask = torch.ones(batch_size, seq_length).to(hidden_states.device)
+        if encoder_attention_mask is None:
+            encoder_attention_mask = torch.ones(batch_size, encoder_seq_length).to(hidden_states.device)
 
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
         # ourselves in which case we just need to make it broadcastable to all heads.
@@ -426,8 +442,7 @@ class T5Stack(nn.Module):
         # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
         if attention_mask.dim() == 2:
             if self.config.is_decoder:
-                batch_size, seq_length = input_ids.size()
-                seq_ids = torch.arange(seq_length, device=input_ids.device)
+                seq_ids = torch.arange(seq_length, device=hidden_states.device)
                 causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
                 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
             else:
@@ -469,16 +484,22 @@ class T5Stack(nn.Module):
         all_hidden_states = ()
         all_attentions = ()
         position_bias = None
+        encoder_decoder_position_bias = None
         for i, layer_module in enumerate(self.layer):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             layer_outputs = layer_module(hidden_states,
                                          attention_mask=extended_attention_mask,
+                                         position_bias=position_bias,
                                          encoder_hidden_states=encoder_hidden_states,
                                          encoder_attention_mask=encoder_extended_attention_mask,
+                                         encoder_decoder_position_bias=encoder_decoder_position_bias,
                                          head_mask=head_mask[i])
             hidden_states = layer_outputs[0]
+            if i == 0:
+                position_bias = layer_outputs[2] if len(layer_outputs) > 3 else None
+                encoder_decoder_position_bias = layer_outputs[4] if len(layer_outputs) > 5 else None
 
             if self.output_attentions:
                 all_attentions = all_attentions + (layer_outputs[1],)
@@ -700,14 +721,8 @@ class T5WithLMHead(T5PreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head
 
-    def forward(self, input_ids, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None,
-                lm_labels=None):
-
-        outputs = self.transformer(input_ids,
-                            attention_mask=attention_mask,
-                            token_type_ids=token_type_ids,
-                            position_ids=position_ids, 
-                            head_mask=head_mask)
+    def forward(self, encoder_input_ids, decoder_input_ids, **kwargs):
+        outputs = self.transformer(encoder_input_ids, decoder_input_ids, **kwargs)
 
         sequence_output = outputs[0]
         lm_logits = self.cls(sequence_output)

From 3835e1e651ebeeddaa8dd8cb5f4d30912ec5ec6d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 6 Nov 2019 11:52:29 +0100
Subject: [PATCH 004/110] adding tokenizer

---
 transformers/tokenization_t5.py | 188 +++++++++-----------------------
 1 file changed, 51 insertions(+), 137 deletions(-)

diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py
index 3f8f4bf556..cff6a41baf 100644
--- a/transformers/tokenization_t5.py
+++ b/transformers/tokenization_t5.py
@@ -16,16 +16,15 @@
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import collections
 import logging
 import os
-import unicodedata
-from io import open
 
 from .tokenization_utils import PreTrainedTokenizer
 
 logger = logging.getLogger(__name__)
 
+SPIECE_UNDERLINE = u'▁'
+
 ####################################################
 # Mapping from the keyword arguments names of Tokenizer `__init__`
 # to file names for serializing Tokenizer instances
@@ -39,8 +38,7 @@ VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
 PRETRAINED_VOCAB_FILES_MAP = {
     'vocab_file':
     {
-        't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-vocab.txt",
-        't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-vocab.txt",
+        't5': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
     }
 }
 
@@ -48,167 +46,83 @@ PRETRAINED_VOCAB_FILES_MAP = {
 # Mapping from model shortcut names to max length of inputs
 ####################################################
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    't5-base-uncased': 512,
-    't5-large-uncased': 512,
+    't5': 512,
 }
 
-####################################################
-# Mapping from model shortcut names to a dictionary of additional
-# keyword arguments for Tokenizer `__init__`.
-# To be used for checkpoint specific configurations.
-####################################################
-PRETRAINED_INIT_CONFIGURATION = {
-    't5-base-uncased': {'do_lower_case': True},
-    't5-large-uncased': {'do_lower_case': True},
-}
-
-
-def load_vocab(vocab_file):
-    """Loads a vocabulary file into a dictionary."""
-    vocab = collections.OrderedDict()
-    with open(vocab_file, "r", encoding="utf-8") as reader:
-        tokens = reader.readlines()
-    for index, token in enumerate(tokens):
-        token = token.rstrip('\n')
-        vocab[token] = index
-    return vocab
-
-
 class T5Tokenizer(PreTrainedTokenizer):
-    r"""
-    Constructs a T5Tokenizer.
-    :class:`~transformers.T5Tokenizer` runs end-to-end tokenization: punctuation splitting + wordpiece
-
-    Args:
-        vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
     """
+        SentencePiece based tokenizer. Peculiarities:
 
+            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+    """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
-    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file, do_lower_case=True,
-                 unk_token="[UNK]", sep_token="[SEP]", pad_token="[PAD]", cls_token="[CLS]",
-                 mask_token="[MASK]", **kwargs):
-        """Constructs a T5Tokenizer.
+    def __init__(self, vocab_file, eos_token="</s>", unk_token="<unk>",
+                 pad_token="<pad>", **kwargs):
+        super(T5Tokenizer, self).__init__(eos_token=eos_token, unk_token=unk_token,
+                                          pad_token=pad_token, **kwargs)
 
-        Args:
-            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file
-            **do_lower_case**: (`optional`) boolean (default True)
-                Whether to lower case the input
-                Only has an effect when do_basic_tokenize=True
-        """
-        super(T5Tokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
-                                           pad_token=pad_token, cls_token=cls_token,
-                                           mask_token=mask_token, **kwargs)
-        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
-        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning("You need to install SentencePiece to use T5Tokenizer:"
+                           "https://github.com/google/sentencepiece"
+                           "pip install sentencepiece")
 
-        if not os.path.isfile(vocab_file):
-            raise ValueError(
-                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
-                "model use `tokenizer = T5Tokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
-        self.vocab = load_vocab(vocab_file)
+        self.vocab_file = vocab_file
+
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(vocab_file)
 
     @property
     def vocab_size(self):
-        return len(self.vocab)
+        return self.sp_model.get_piece_size()
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning("You need to install SentencePiece to use XLNetTokenizer: https://github.com/google/sentencepiece"
+                           "pip install sentencepiece")
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(self.vocab_file)
 
     def _tokenize(self, text):
         """ Take as input a string and return a list of strings (tokens) for words/sub-words
         """
-        split_tokens = []
-        if self.do_basic_tokenize:
-            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
-                for sub_token in self.wordpiece_tokenizer.tokenize(token):
-                    split_tokens.append(sub_token)
-        else:
-            split_tokens = self.wordpiece_tokenizer.tokenize(text)
-        return split_tokens
+        return self.sp_model.EncodeAsPieces(text)
 
     def _convert_token_to_id(self, token):
         """ Converts a token (str/unicode) in an id using the vocab. """
-        return self.vocab.get(token, self.vocab.get(self.unk_token))
+        return self.sp_model.piece_to_id(token)
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        return self.ids_to_tokens.get(index, self.unk_token)
+        return self.sp_model.id_to_piece(index)
 
     def convert_tokens_to_string(self, tokens):
         """ Converts a sequence of tokens (string) in a single string. """
-        out_string = ' '.join(tokens).replace(' ##', '').strip()
+        out_string = self.sp_model.decode_pieces(tokens)
         return out_string
 
-    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+    def save_vocabulary(self, save_directory):
+        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
+            to a directory.
         """
-        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
-        by concatenating and adding special tokens.
-        A BERT sequence has the following format:
-            single sequence: [CLS] X [SEP]
-            pair of sequences: [CLS] A [SEP] B [SEP]
-        """
-        if token_ids_1 is None:
-            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
-        cls = [self.cls_token_id]
-        sep = [self.sep_token_id]
-        return cls + token_ids_0 + sep + token_ids_1 + sep
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
 
-    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
-        """
-        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
-        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
 
-        Args:
-            token_ids_0: list of ids (must not contain special tokens)
-            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
-                for sequence pairs
-            already_has_special_tokens: (default False) Set to True if the token list is already formated with
-                special tokens for the model
-
-        Returns:
-            A list of integers in the range [0, 1]: 0 for a special token, 1 for a sequence token.
-        """
-
-        if already_has_special_tokens:
-            if token_ids_1 is not None:
-                raise ValueError("You should not supply a second sequence if the provided sequence of "
-                                 "ids is already formated with special tokens for the model.")
-            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
-
-        if token_ids_1 is not None:
-            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
-        return [1] + ([0] * len(token_ids_0)) + [1]
-
-    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
-        """
-        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
-        A BERT sequence pair mask has the following format:
-        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
-        | first sequence    | second sequence
-
-        if token_ids_1 is None, only returns the first portion of the mask (0's).
-        """
-        sep = [self.sep_token_id]
-        cls = [self.cls_token_id]
-        if token_ids_1 is None:
-            return len(cls + token_ids_0 + sep) * [0]
-        return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
-
-    def save_vocabulary(self, vocab_path):
-        """Save the tokenizer vocabulary to a directory or file."""
-        index = 0
-        if os.path.isdir(vocab_path):
-            vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES['vocab_file'])
-        else:
-            vocab_file = vocab_path
-        with open(vocab_file, "w", encoding="utf-8") as writer:
-            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
-                if index != token_index:
-                    logger.warning("Saving vocabulary to {}: vocabulary indices are not consecutive."
-                                   " Please check that the vocabulary is not corrupted!".format(vocab_file))
-                    index = token_index
-                writer.write(token + u'\n')
-                index += 1
-        return (vocab_file,)
+        return (out_vocab_file,)

From 73f2c342f53f2ff02124da23ba029d80c386e7ce Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 6 Nov 2019 11:52:39 +0100
Subject: [PATCH 005/110] fixing template

---
 templates/adding_a_new_model/configuration_xxx.py | 2 +-
 templates/adding_a_new_model/modeling_xxx.py      | 1 -
 2 files changed, 1 insertion(+), 2 deletions(-)

diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py
index b1614e71af..14c1c2c79e 100644
--- a/templates/adding_a_new_model/configuration_xxx.py
+++ b/templates/adding_a_new_model/configuration_xxx.py
@@ -84,7 +84,7 @@ class XxxConfig(PretrainedConfig):
                  summary_first_dropout=0.1,
                  **kwargs):
         super(XxxConfig, self).__init__(**kwargs)
-        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, six.string_types) else -1
+        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
         self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py
index ff64f13f40..ee705e753c 100644
--- a/templates/adding_a_new_model/modeling_xxx.py
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -280,7 +280,6 @@ class XxxModel(XxxPreTrainedModel):
 
         self.init_weights()
 
-    @property
     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
 

From 076a207935bfcc38416cd0baa887d3e025ebef28 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 6 Nov 2019 11:52:50 +0100
Subject: [PATCH 006/110] adding tests and updating model

---
 transformers/__init__.py                   |  11 +-
 transformers/configuration_t5.py           |  53 +++---
 transformers/modeling_t5.py                | 151 ++++++++--------
 transformers/tests/modeling_common_test.py |  32 ++--
 transformers/tests/modeling_t5_test.py     | 176 +++++++++++++++++++
 transformers/tests/modeling_tf_t5_test.py  | 190 +++++++++++++++++++++
 transformers/tests/tokenization_t5_test.py |  77 +++++++++
 7 files changed, 571 insertions(+), 119 deletions(-)
 create mode 100644 transformers/tests/modeling_t5_test.py
 create mode 100644 transformers/tests/modeling_tf_t5_test.py
 create mode 100644 transformers/tests/tokenization_t5_test.py

diff --git a/transformers/__init__.py b/transformers/__init__.py
index 53f3c39dc7..bf896276d6 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -42,6 +42,7 @@ from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
+from .tokenization_t5 import T5Tokenizer
 
 # Configurations
 from .configuration_utils import PretrainedConfig
@@ -52,10 +53,10 @@ from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CON
 from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 
 # Modeling
 if is_torch_available():
@@ -69,10 +70,10 @@ if is_torch_available():
                                 BertForTokenClassification, BertForQuestionAnswering,
                                 load_tf_weights_in_bert, BERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_openai import (OpenAIGPTPreTrainedModel, OpenAIGPTModel,
-                                OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
-                                load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
+                                  OpenAIGPTLMHeadModel, OpenAIGPTDoubleHeadsModel,
+                                  load_tf_weights_in_openai_gpt, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_transfo_xl import (TransfoXLPreTrainedModel, TransfoXLModel, TransfoXLLMHeadModel,
-                                    load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
+                                      load_tf_weights_in_transfo_xl, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_gpt2 import (GPT2PreTrainedModel, GPT2Model,
                                 GPT2LMHeadModel, GPT2DoubleHeadsModel,
                                 load_tf_weights_in_gpt2, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP)
@@ -95,6 +96,8 @@ if is_torch_available():
                                 DistilBertForSequenceClassification, DistilBertForQuestionAnswering,
                                 DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
+    from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel,
+                              T5_PRETRAINED_MODEL_ARCHIVE_MAP)
 
     # Optimization
     from .optimization import (AdamW, ConstantLRSchedule, WarmupConstantSchedule, WarmupCosineSchedule,
diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py
index a37a5b2157..9db918e59f 100644
--- a/transformers/configuration_t5.py
+++ b/transformers/configuration_t5.py
@@ -64,44 +64,29 @@ class T5Config(PretrainedConfig):
     pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=50257,
-                 n_positions=1024,
-                 n_ctx=1024,
-                 n_embd=768,
-                 n_layer=12,
-                 n_head=12,
-                 resid_pdrop=0.1,
-                 embd_pdrop=0.1,
-                 attn_pdrop=0.1,
-                 layer_norm_epsilon=1e-5,
+                 vocab_size_or_config_json_file=32128,
+                 n_positions=512,
+                 d_model=512,
+                 d_ff=2048,
+                 num_layers=12,
+                 num_heads=12,
+                 relative_attention_num_buckets=32,
+                 dropout_rate=0.1,
+                 layer_norm_epsilon=1e-6,
                  initializer_range=0.02,
-
-                 num_labels=1,
-                 summary_type='cls_index',
-                 summary_use_proj=True,
-                 summary_activation=None,
-                 summary_proj_to_labels=True,
-                 summary_first_dropout=0.1,
                  **kwargs):
         super(T5Config, self).__init__(**kwargs)
-        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, six.string_types) else -1
-        self.n_ctx = n_ctx
+        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
         self.n_positions = n_positions
-        self.n_embd = n_embd
-        self.n_layer = n_layer
-        self.n_head = n_head
-        self.resid_pdrop = resid_pdrop
-        self.embd_pdrop = embd_pdrop
-        self.attn_pdrop = attn_pdrop
+        self.d_model = d_model
+        self.d_ff = d_ff
+        self.num_layers = num_layers
+        self.num_heads = num_heads
+        self.relative_attention_num_buckets = relative_attention_num_buckets
+        self.dropout_rate = dropout_rate
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
 
-        self.num_labels = num_labels
-        self.summary_type = summary_type
-        self.summary_use_proj = summary_use_proj
-        self.summary_activation = summary_activation
-        self.summary_first_dropout = summary_first_dropout
-        self.summary_proj_to_labels = summary_proj_to_labels
         if isinstance(vocab_size_or_config_json_file, six.string_types):
             with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
                 json_config = json.loads(reader.read())
@@ -119,12 +104,12 @@ class T5Config(PretrainedConfig):
 
     @property
     def hidden_size(self):
-        return self.n_embd
+        return self.d_model
 
     @property
     def num_attention_heads(self):
-        return self.n_head
+        return self.num_heads
 
     @property
     def num_hidden_layers(self):
-        return self.n_layer
+        return self.num_layers
diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index e1a1d019ff..ce443cf882 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -20,8 +20,8 @@ import json
 import logging
 import math
 import os
-import math
 import sys
+import copy
 import itertools
 from io import open
 
@@ -30,7 +30,7 @@ from torch import nn
 import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .modeling_utils import PreTrainedModel, prune_linear_layer
+from .modeling_utils import PreTrainedModel
 from .configuration_t5 import T5Config
 from .file_utils import add_start_docstrings
 
@@ -127,7 +127,7 @@ class T5DenseReluDense(nn.Module):
         super(T5DenseReluDense, self).__init__()
         self.wi = nn.Linear(config.d_model, config.d_ff, bias=False)
         self.wo = nn.Linear(config.d_ff, config.d_model, bias=False)
-        self.dropout = nn.Dropout(config.dropout)
+        self.dropout = nn.Dropout(config.dropout_rate)
 
     def forward(self, hidden_states):
         h = self.wi(hidden_states)
@@ -141,8 +141,8 @@ class T5LayerFF(nn.Module):
     def __init__(self, config):
         super(T5LayerFF, self).__init__()
         self.DenseReluDense = T5DenseReluDense(config)
-        self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout)
+        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
 
     def forward(self, hidden_states):
         norm_x = self.layer_norm(hidden_states)
@@ -157,6 +157,7 @@ class T5Attention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
         super(T5Attention, self).__init__()
         self.layer_id = next(T5Attention.NEW_ID)
+        self.is_decoder = config.is_decoder
         self.has_relative_attention_bias = has_relative_attention_bias
 
         self.output_attentions = config.output_attentions
@@ -231,7 +232,7 @@ class T5Attention(nn.Module):
             ret += (n < 0).to(torch.long) * num_buckets  # mtf.to_int32(mtf.less(n, 0)) * num_buckets
             n = torch.abs(n)
         else:
-            n = torch.max(n, 0)
+            n = torch.max(n, torch.zeros_like(n))
         # now n is in the range [0, inf)
 
         # half of the buckets are for exact increments in positions
@@ -242,7 +243,7 @@ class T5Attention(nn.Module):
         val_if_large = max_exact + (
             torch.log(n.float() / max_exact)
             / math.log(max_distance / max_exact) * (num_buckets - max_exact)).to(torch.long)
-        val_if_large = torch.min(val_if_large, num_buckets - 1)
+        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
 
         ret += torch.where(is_small, n, val_if_large)
         return ret
@@ -259,7 +260,7 @@ class T5Attention(nn.Module):
         values = values.permute([2, 0, 1]).unsqueeze(0) # shape (1, num_heads, qlen, klen)
         return values
 
-    def forward(self, input, mask, kv=None, position_bias=None, cache=None, head_mask=None):
+    def forward(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None):
         """
         Self-attention (if kv is None) or attention over source sentence (provided by kv).
         """
@@ -273,7 +274,6 @@ class T5Attention(nn.Module):
         # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
         n_heads = self.n_heads
         dim_per_head = self.dim // n_heads
-        mask_reshape = (bs, 1, qlen, klen) if mask.dim() == 3 else (bs, 1, 1, klen)
 
         def shape(x):
             """  projection """
@@ -311,8 +311,9 @@ class T5Attention(nn.Module):
             position_bias = self.compute_bias(qlen, klen)
         scores += position_bias
 
-        mask = (mask == 0).view(mask_reshape).expand_as(scores)               # (bs, n_heads, qlen, klen)
-        scores.masked_fill_(mask, -float('inf'))                              # (bs, n_heads, qlen, klen)
+        if mask is not None:
+            mask = (mask == 0).expand_as(scores)                              # (bs, n_heads, qlen, klen)
+            scores.masked_fill_(mask, -float('inf'))                          # (bs, n_heads, qlen, klen)
 
         weights = F.softmax(scores.float(), dim=-1).type_as(scores)           # (bs, n_heads, qlen, klen)
         weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
@@ -338,13 +339,13 @@ class T5LayerSelfAttention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
         super(T5LayerSelfAttention, self).__init__()
         self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
-        self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout)
+        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
 
     def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None):
         norm_x = self.layer_norm(hidden_states)
         attention_output = self.SelfAttention(norm_x,
-                                              attention_mask=attention_mask,
+                                              mask=attention_mask,
                                               position_bias=position_bias,
                                               head_mask=head_mask)
         y = attention_output[0]
@@ -357,14 +358,14 @@ class T5LayerCrossAttention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
         super(T5LayerCrossAttention, self).__init__()
         self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
-        self.layer_norm = nn.LayerNorm(config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout)
+        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
 
     def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None):
         norm_x = self.layer_norm(hidden_states)
         attention_output = self.EncDecAttention(norm_x,
+                                                mask=attention_mask,
                                                 kv=kv,
-                                                attention_mask=attention_mask,
                                                 position_bias=position_bias,
                                                 head_mask=head_mask)
         y = attention_output[0]
@@ -410,13 +411,41 @@ class T5Block(nn.Module):
         return outputs
 
 
-class T5Stack(nn.Module):
+class T5PreTrainedModel(PreTrainedModel):
+    """ An abstract class to handle weights initialization and
+        a simple interface for downloading and loading pretrained models.
+    """
+    config_class = T5Config
+    pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
+    load_tf_weights = load_tf_weights_in_t5
+    base_model_prefix = "transformer"
+
+    def _init_weights(self, module):
+        """ Initialize the weights """
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+
+class T5Stack(T5PreTrainedModel):
     def __init__(self, config):
-        super(T5Stack, self).__init__()
+        super(T5Stack, self).__init__(config)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.is_decoder = config.is_decoder
+
         self.blocks = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0))
                                      for i in range(config.num_layers)])
-        self.final_layer_norm = nn.LayerNorm(config.layer_norm_epsilon)
-        self.dropout = nn.Dropout(config.dropout)
+        self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.dropout = nn.Dropout(config.dropout_rate)
+
+        self.init_weights()
 
     def forward(self,
                 hidden_states,
@@ -426,10 +455,10 @@ class T5Stack(nn.Module):
                 head_mask=None):
 
         batch_size, seq_length = hidden_states.shape[0], hidden_states.shape[1]
-        encoder_seq_length = encoder_hidden_states.shape[1] if encoder_hidden_states is not None else 0
         if attention_mask is None:
             attention_mask = torch.ones(batch_size, seq_length).to(hidden_states.device)
-        if encoder_attention_mask is None:
+        if self.is_decoder and encoder_attention_mask is None:
+            encoder_seq_length = encoder_hidden_states.shape[1]
             encoder_attention_mask = torch.ones(batch_size, encoder_seq_length).to(hidden_states.device)
 
         # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
@@ -444,6 +473,7 @@ class T5Stack(nn.Module):
             if self.config.is_decoder:
                 seq_ids = torch.arange(seq_length, device=hidden_states.device)
                 causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
+                causal_mask = causal_mask.to(attention_mask)
                 extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
             else:
                 extended_attention_mask = attention_mask[:, None, None, :]
@@ -456,15 +486,18 @@ class T5Stack(nn.Module):
         extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
 
-        # If a 2D ou 3D attention mask is provided for the cross-attention
-        # we need to make broadcastabe to [batch_size, num_heads, seq_length, seq_length]
-        if encoder_attention_mask.dim() == 3:
-            encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
-        if encoder_attention_mask.dim() == 2:
-            encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+        if self.is_decoder:
+            # If a 2D or 3D attention mask is provided for the cross-attention
+            # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            if encoder_attention_mask.dim() == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            if encoder_attention_mask.dim() == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
 
-        encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-        encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
+            encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
+        else:
+            encoder_extended_attention_mask = None
 
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
@@ -474,18 +507,18 @@ class T5Stack(nn.Module):
         if head_mask is not None:
             if head_mask.dim() == 1:
                 head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(-1).unsqueeze(-1)
-                head_mask = head_mask.expand(self.config.num_hidden_layers, -1, -1, -1, -1)
+                head_mask = head_mask.expand(self.config.num_layers, -1, -1, -1, -1)
             elif head_mask.dim() == 2:
                 head_mask = head_mask.unsqueeze(1).unsqueeze(-1).unsqueeze(-1)  # We can specify head_mask for each layer
             head_mask = head_mask.to(dtype=next(self.parameters()).dtype) # switch to fload if need + fp16 compatibility
         else:
-            head_mask = [None] * self.config.num_hidden_layers
+            head_mask = [None] * self.config.num_layers
 
         all_hidden_states = ()
         all_attentions = ()
         position_bias = None
         encoder_decoder_position_bias = None
-        for i, layer_module in enumerate(self.layer):
+        for i, layer_module in enumerate(self.blocks):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
@@ -498,8 +531,9 @@ class T5Stack(nn.Module):
                                          head_mask=head_mask[i])
             hidden_states = layer_outputs[0]
             if i == 0:
-                position_bias = layer_outputs[2] if len(layer_outputs) > 3 else None
-                encoder_decoder_position_bias = layer_outputs[4] if len(layer_outputs) > 5 else None
+                position_bias = layer_outputs[2 if self.output_attentions else 1]
+                if self.is_decoder:
+                    encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
 
             if self.output_attentions:
                 all_attentions = all_attentions + (layer_outputs[1],)
@@ -519,27 +553,6 @@ class T5Stack(nn.Module):
         return outputs  # last-layer hidden state, (all hidden states), (all attentions)
 
 
-class T5PreTrainedModel(PreTrainedEncoderDecoder):
-    """ An abstract class to handle weights initialization and
-        a simple interface for dowloading and loading pretrained models.
-    """
-    config_class = T5Config
-    pretrained_model_archive_map = T5_PRETRAINED_MODEL_ARCHIVE_MAP
-    load_tf_weights = load_tf_weights_in_t5
-
-    def _init_weights(self, module):
-        """ Initialize the weights """
-        if isinstance(module, (nn.Linear, nn.Embedding)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-        if isinstance(module, nn.Linear) and module.bias is not None:
-            module.bias.data.zero_()
-
-
 T5_START_DOCSTRING = r"""    The T5 model was proposed in
     `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
     by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
@@ -620,7 +633,7 @@ class T5Model(T5PreTrainedModel):
     """
     def __init__(self, config):
         super(T5Model, self).__init__(config)
-        self.shared = nn.Embeddings(config.vocab_size, config.d_model)
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
 
         encoder_config = copy.deepcopy(config)
         self.encoder = T5Stack(encoder_config)
@@ -631,7 +644,6 @@ class T5Model(T5PreTrainedModel):
 
         self.init_weights()
 
-    @property
     def get_input_embeddings(self):
         return self.shared
 
@@ -646,17 +658,17 @@ class T5Model(T5PreTrainedModel):
         for layer, heads in heads_to_prune.items():
             self.encoder.layer[layer].attention.prune_heads(heads)
 
-    def forward(self, encoder_input_ids, decoder_input_ids, **kwargs):
+    def forward(self, **kwargs):
         # keyword arguments come in 3 flavors: encoder-specific (prefixed by
         # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
         # that apply to the model as whole.
         # We let the specific kwargs override the common ones in case of conflict.
         kwargs_common = dict((k, v) for k, v in kwargs.items()
                              if not k.startswith("encoder_") and not k.startswith("decoder_"))
-        kwargs_decoder = kwargs_common.copy()
         kwargs_encoder = kwargs_common.copy()
-        kwargs_encoder.update(dict((k[len("encoder_") :], v) for k, v in kwargs.items() if k.startswith("encoder_")))
-        kwargs_decoder.update(dict((k[len("decoder_") :], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
 
         # Encode if needed (training, first prediction pass)
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
@@ -680,7 +692,7 @@ class T5Model(T5PreTrainedModel):
 
 @add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
     T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
-class T5WithLMHead(T5PreTrainedModel):
+class T5WithLMHeadModel(T5PreTrainedModel):
     r"""
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
@@ -704,14 +716,14 @@ class T5WithLMHead(T5PreTrainedModel):
     Examples::
 
         tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased')
-        model = T5ForMaskedLM.from_pretrained('t5-base-uncased')
+        model = T5WithLMHeadModel.from_pretrained('t5-base-uncased')
         input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, lm_labels=input_ids)
         loss, prediction_scores = outputs[:2]
 
     """
     def __init__(self, config):
-        super(T5ForMaskedLM, self).__init__(config)
+        super(T5WithLMHeadModel, self).__init__(config)
 
         self.transformer = T5Model(config)
         self.lm_head = nn.Linear(config.d_model, config.vocab_size)
@@ -721,11 +733,12 @@ class T5WithLMHead(T5PreTrainedModel):
     def get_output_embeddings(self):
         return self.lm_head
 
-    def forward(self, encoder_input_ids, decoder_input_ids, **kwargs):
-        outputs = self.transformer(encoder_input_ids, decoder_input_ids, **kwargs)
+    def forward(self, **kwargs):
+        lm_labels = kwargs.pop('decoder_lm_labels', None)
+        outputs = self.transformer(**kwargs)
 
         sequence_output = outputs[0]
-        lm_logits = self.cls(sequence_output)
+        lm_logits = self.lm_head(sequence_output)
 
         outputs = (lm_logits,) + outputs[2:]  # Add hidden states and attention if they are here
         if lm_labels is not None:
diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index ddc0f9f3de..42bf9ac3f5 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -73,6 +73,7 @@ class CommonTestCases:
         test_pruning = True
         test_resize_embeddings = True
         test_head_masking = True
+        is_encoder_decoder = False
 
         def test_save_load(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -114,10 +115,9 @@ class CommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
                 model.eval()
-                first, second = model(inputs_dict["input_ids"])[0], model(inputs_dict["input_ids"])[0]
+                first, second = model(**inputs_dict)[0], model(**inputs_dict)[0]
                 self.assertEqual(first.ne(second).sum().item(), 0)
 
-
         def test_attention_outputs(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
@@ -127,31 +127,42 @@ class CommonTestCases:
                 model = model_class(config)
                 model.eval()
                 outputs = model(**inputs_dict)
-                attentions = outputs[-1]
+                self_attentions = outputs[-1]
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, False)
-                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+                self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
+                    list(self_attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
                     self.model_tester.seq_length,
                     self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
                 out_len = len(outputs)
 
+                if self.is_encoder_decoder:
+                    cross_attentions = outputs[-2]
+                    self.assertEqual(model.config.output_attentions, True)
+                    self.assertEqual(model.config.output_hidden_states, False)
+                    self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
+                    self.assertListEqual(
+                        list(cross_attentions[0].shape[-3:]),
+                        [self.model_tester.num_attention_heads,
+                        self.model_tester.seq_length,
+                        self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+
                 # Check attention is always last and order is fine
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
                 model.eval()
                 outputs = model(**inputs_dict)
-                self.assertEqual(out_len+1, len(outputs))
+                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, True)
 
-                attentions = outputs[-1]
-                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
+                self_attentions = outputs[-1]
+                self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
-                    list(attentions[0].shape[-3:]),
+                    list(self_attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
                     self.model_tester.seq_length,
                     self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
@@ -214,7 +225,6 @@ class CommonTestCases:
 
                 self.assertTrue(models_equal)
 
-
         def test_headmasking(self):
             if not self.test_head_masking:
                 return
@@ -268,7 +278,6 @@ class CommonTestCases:
                 self.assertNotEqual(
                     attentions[-1][..., -1, :, :].flatten().sum().item(), 0.0)
 
-
         def test_head_pruning(self):
             if not self.test_pruning:
                 return
@@ -411,7 +420,6 @@ class CommonTestCases:
 
                 self.assertDictEqual(model.config.pruned_heads, {0: [0], 1: [1, 2], 2: [1, 2]})
 
-
         def test_hidden_states_output(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py
new file mode 100644
index 0000000000..b8bb828ebd
--- /dev/null
+++ b/transformers/tests/modeling_t5_test.py
@@ -0,0 +1,176 @@
+# coding=utf-8
+# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import shutil
+import pytest
+
+from transformers import is_torch_available
+
+from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester
+
+if is_torch_available():
+    from transformers import (T5Config, T5Model, T5WithLMHeadModel)
+    from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP
+else:
+    pytestmark = pytest.mark.skip("Require Torch")
+
+
+class T5ModelTest(CommonTestCases.CommonModelTester):
+
+    all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else ()
+    test_pruning = False
+    test_torchscript = False
+    test_resize_embeddings = False
+    is_encoder_decoder = True
+
+    class T5ModelTester(object):
+
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_input_mask=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     n_positions=14,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     d_ff=37,
+                     relative_attention_num_buckets=8,
+                     dropout_rate=0.1,
+                     initializer_range=0.02,
+                     scope=None,
+                    ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_input_mask = use_input_mask
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.n_positions = n_positions
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.d_ff = d_ff
+            self.relative_attention_num_buckets = relative_attention_num_buckets
+            self.dropout_rate = dropout_rate
+            self.initializer_range = initializer_range
+            self.scope = scope
+
+        def prepare_config_and_inputs(self):
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+            input_mask = None
+            if self.use_input_mask:
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+            token_labels = None
+            if self.use_labels:
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+            config = T5Config(
+                vocab_size_or_config_json_file=self.vocab_size,
+                n_positions=self.n_positions,
+                d_model=self.hidden_size,
+                d_ff=self.d_ff,
+                num_layers=self.num_hidden_layers,
+                num_heads=self.num_attention_heads,
+                relative_attention_num_buckets=self.relative_attention_num_buckets,
+                dropout_rate=self.dropout_rate,
+                initializer_range=self.initializer_range)
+
+            return (config, input_ids, input_mask, token_labels)
+
+        def check_loss_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+
+        def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
+            model = T5Model(config=config)
+            model.eval()
+            encoder_output, decoder_output = model(encoder_input_ids=input_ids,
+                                                   decoder_input_ids=input_ids,
+                                                   decoder_attention_mask=input_mask)
+            encoder_output, decoder_output = model(encoder_input_ids=input_ids,
+                                                   decoder_input_ids=input_ids)
+
+            result = {
+                "encoder_output": encoder_output,
+                "decoder_output": decoder_output,
+            }
+            self.parent.assertListEqual(
+                list(result["encoder_output"].size()),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(
+                list(result["decoder_output"].size()),
+                [self.batch_size, self.seq_length, self.hidden_size])
+
+
+        def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
+            model = T5WithLMHeadModel(config=config)
+            model.eval()
+            loss, prediction_scores = model(encoder_input_ids=input_ids, decoder_input_ids=input_ids,
+                                            decoder_attention_mask=input_mask, decoder_lm_labels=token_labels)
+            result = {
+                "loss": loss,
+                "prediction_scores": prediction_scores,
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
+            self.check_loss_output(result)
+
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, input_mask, token_labels) = config_and_inputs
+            inputs_dict = {'encoder_input_ids': input_ids,
+                           'decoder_input_ids': input_ids,
+                           'decoder_attention_mask': input_mask}
+            return config, inputs_dict
+
+    def setUp(self):
+        self.model_tester = T5ModelTest.T5ModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_t5_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_t5_model(*config_and_inputs)
+
+    def test_with_lm_head(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
+
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/transformers_test/"
+        for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = T5Model.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py
new file mode 100644
index 0000000000..fac6763432
--- /dev/null
+++ b/transformers/tests/modeling_tf_t5_test.py
@@ -0,0 +1,190 @@
+# coding=utf-8
+# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+import shutil
+import pytest
+import sys
+
+from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
+from .configuration_common_test import ConfigTester
+
+from transformers import T5Config, is_tf_available
+
+if False:  # is_tf_available():
+    import tensorflow as tf
+    from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+else:
+    pytestmark = pytest.mark.skip("Require TensorFlow")
+
+
+class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
+    # NOTE: this module is skipped as a whole (see pytestmark above) while the TF import guard is `if False`.
+    all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if False  else () # is_tf_available() else ()
+
+    class TFT5ModelTester(object):
+        # Stores small hyperparameters, builds random inputs and runs the shape checks used by the tests.
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_input_mask=True,
+                     use_token_type_ids=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     intermediate_size=37,
+                     hidden_act="gelu",
+                     hidden_dropout_prob=0.1,
+                     attention_probs_dropout_prob=0.1,
+                     max_position_embeddings=512,
+                     type_vocab_size=16,
+                     type_sequence_label_size=2,
+                     initializer_range=0.02,
+                     num_labels=3,
+                     num_choices=4,
+                     scope=None,
+                    ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_input_mask = use_input_mask
+            self.use_token_type_ids = use_token_type_ids
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.intermediate_size = intermediate_size
+            self.hidden_act = hidden_act
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.type_sequence_label_size = type_sequence_label_size
+            self.initializer_range = initializer_range
+            self.num_labels = num_labels
+            self.num_choices = num_choices
+            self.scope = scope
+
+        def prepare_config_and_inputs(self):
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            # The optional tensors below stay None when the matching use_* flag is False.
+            input_mask = None
+            if self.use_input_mask:
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+            token_type_ids = None
+            if self.use_token_type_ids:
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+            sequence_labels = None
+            token_labels = None
+            choice_labels = None
+            if self.use_labels:
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+            # NOTE(review): T5Config in this series takes d_model/d_ff/num_layers/num_heads/...; the BERT-style names below presumably land in **kwargs - confirm.
+            config = T5Config(
+                vocab_size_or_config_json_file=self.vocab_size,
+                hidden_size=self.hidden_size,
+                num_hidden_layers=self.num_hidden_layers,
+                num_attention_heads=self.num_attention_heads,
+                intermediate_size=self.intermediate_size,
+                hidden_act=self.hidden_act,
+                hidden_dropout_prob=self.hidden_dropout_prob,
+                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+                max_position_embeddings=self.max_position_embeddings,
+                type_vocab_size=self.type_vocab_size,
+                initializer_range=self.initializer_range)
+
+            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+        def create_and_check_t5_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = TFT5Model(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            sequence_output, pooled_output = model(inputs)
+            # The model is called again with a list and then a bare tensor; only the last call's outputs are checked.
+            inputs = [input_ids, input_mask]
+            sequence_output, pooled_output = model(inputs)
+
+            sequence_output, pooled_output = model(input_ids)
+
+            result = {
+                "sequence_output": sequence_output.numpy(),
+                "pooled_output": pooled_output.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["sequence_output"].shape),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
+
+
+        def create_and_check_t5_with_lm_head(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+            model = TFT5WithLMHeadModel(config=config)
+            inputs = {'input_ids': input_ids,
+                      'attention_mask': input_mask,
+                      'token_type_ids': token_type_ids}
+            prediction_scores, = model(inputs)  # unpacking requires the model to return exactly one output
+            result = {
+                "prediction_scores": prediction_scores.numpy(),
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].shape),
+                [self.batch_size, self.seq_length, self.vocab_size])
+
+
+        def prepare_config_and_inputs_for_common(self):  # returns (config, inputs_dict); labels are dropped
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, token_type_ids, input_mask,
+             sequence_labels, token_labels, choice_labels) = config_and_inputs
+            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            return config, inputs_dict
+
+    def setUp(self):  # creates the two helpers used by the test_* methods below
+        self.model_tester = TFT5ModelTest.TFT5ModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=T5Config, hidden_size=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_t5_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_t5_model(*config_and_inputs)
+
+    def test_with_lm_head(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
+
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):  # smoke test: a pretrained checkpoint can be loaded by name
+        cache_dir = "/tmp/transformers_test/"
+        for model_name in ['t5-base']:
+            model = TFT5Model.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)  # remove the cache directory before asserting
+            self.assertIsNotNone(model)
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/transformers/tests/tokenization_t5_test.py b/transformers/tests/tokenization_t5_test.py
new file mode 100644
index 0000000000..9362487d8d
--- /dev/null
+++ b/transformers/tests/tokenization_t5_test.py
@@ -0,0 +1,77 @@
+# coding=utf-8
+# Copyright 2018 Google T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+import pytest
+
+from transformers.tokenization_t5 import (T5Tokenizer, SPIECE_UNDERLINE)
+
+from .tokenization_tests_commons import CommonTestCases
+
+SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)),
+                    'fixtures/test_sentencepiece.model')
+
+class T5TokenizationTest(CommonTestCases.CommonTokenizerTester):
+    # Runs T5Tokenizer against the SentencePiece fixture referenced by SAMPLE_VOCAB.
+    tokenizer_class = T5Tokenizer
+
+    def setUp(self):
+        super(T5TokenizationTest, self).setUp()
+
+        # Build a tokenizer from the SentencePiece fixture and save it where get_tokenizer() reloads it
+        tokenizer = T5Tokenizer(SAMPLE_VOCAB, keep_accents=True)
+        tokenizer.save_pretrained(self.tmpdirname)
+
+    def get_tokenizer(self, **kwargs):  # reloads the tokenizer that setUp saved under self.tmpdirname
+        return T5Tokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_input_output_texts(self):  # returns an (input, output) text pair; here both are the same text
+        input_text = u"This is a test"
+        output_text = u"This is a test"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = T5Tokenizer(SAMPLE_VOCAB, keep_accents=True)
+
+        tokens = tokenizer.tokenize(u'This is a test')
+        self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
+
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(tokens), [285, 46, 10, 170, 382])
+
+        tokens = tokenizer.tokenize(u"I was born in 92000, and this is falsé.")
+        self.assertListEqual(tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                    u'or', u'n', SPIECE_UNDERLINE + u'in', SPIECE_UNDERLINE + u'',
+                                    u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                    SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's', u'é', u'.'])
+        ids = tokenizer.convert_tokens_to_ids(tokens)
+        self.assertListEqual(
+            ids, [8, 21, 84, 55, 24, 19, 7, 0,
+                602, 347, 347, 347, 3, 12, 66,
+                46, 72, 80, 6, 0, 4])
+        # The round trip is lossy: the pieces u'9' and u'é' are expected as id 0 above and come back as u'<unk>'.
+        back_tokens = tokenizer.convert_ids_to_tokens(ids)
+        self.assertListEqual(back_tokens, [SPIECE_UNDERLINE + u'I', SPIECE_UNDERLINE + u'was', SPIECE_UNDERLINE + u'b',
+                                        u'or', u'n', SPIECE_UNDERLINE + u'in',
+                                        SPIECE_UNDERLINE + u'', u'<unk>', u'2', u'0', u'0', u'0', u',',
+                                        SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
+                                        SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u's',
+                                        u'<unk>', u'.'])
+
+
+if __name__ == '__main__':
+    unittest.main()

From ba10065c4b44d733d135ad6dc1b8a77f88c6dbb9 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 7 Nov 2019 15:55:36 +0100
Subject: [PATCH 007/110] update model, conversion script, tests and template

---
 ...t_xxx_original_tf_checkpoint_to_pytorch.py |  10 +-
 transformers/__init__.py                      |   1 +
 transformers/configuration_t5.py              |  13 +-
 ...rt_t5_original_tf_checkpoint_to_pytorch.py |  12 +-
 transformers/modeling_t5.py                   | 129 ++++++++++++------
 transformers/tests/modeling_common_test.py    |  41 +++---
 transformers/tests/modeling_t5_test.py        |  12 +-
 transformers/tokenization_t5.py               |   1 +
 8 files changed, 135 insertions(+), 84 deletions(-)

diff --git a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
index d50d129cba..9d389deaad 100755
--- a/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
+++ b/templates/adding_a_new_model/convert_xxx_original_tf_checkpoint_to_pytorch.py
@@ -26,9 +26,9 @@ from transformers import XxxConfig, XxxForPreTraining, load_tf_weights_in_xxx
 import logging
 logging.basicConfig(level=logging.INFO)
 
-def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, xxx_config_file, pytorch_dump_path):
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
     # Initialise PyTorch model
-    config = XxxConfig.from_json_file(xxx_config_file)
+    config = XxxConfig.from_json_file(config_file)
     print("Building PyTorch model from configuration: {}".format(str(config)))
     model = XxxForPreTraining(config)
 
@@ -48,11 +48,11 @@ if __name__ == "__main__":
                         type = str,
                         required = True,
                         help = "Path to the TensorFlow checkpoint path.")
-    parser.add_argument("--xxx_config_file",
+    parser.add_argument("--config_file",
                         default = None,
                         type = str,
                         required = True,
-                        help = "The config json file corresponding to the pre-trained XXX model. \n"
+                        help = "The config json file corresponding to the pre-trained model. \n"
                             "This specifies the model architecture.")
     parser.add_argument("--pytorch_dump_path",
                         default = None,
@@ -61,5 +61,5 @@ if __name__ == "__main__":
                         help = "Path to the output PyTorch model.")
     args = parser.parse_args()
     convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                     args.xxx_config_file,
+                                     args.config_file,
                                      args.pytorch_dump_path)
diff --git a/transformers/__init__.py b/transformers/__init__.py
index bf896276d6..601a068592 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -97,6 +97,7 @@ if is_torch_available():
                                 DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
     from .modeling_encoder_decoder import PreTrainedEncoderDecoder, Model2Model
     from .modeling_t5 import (T5PreTrainedModel, T5Model, T5WithLMHeadModel,
+                              load_tf_weights_in_t5,
                               T5_PRETRAINED_MODEL_ARCHIVE_MAP)
 
     # Optimization
diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py
index 9db918e59f..96e67758ac 100644
--- a/transformers/configuration_t5.py
+++ b/transformers/configuration_t5.py
@@ -57,8 +57,7 @@ class T5Config(PretrainedConfig):
                 (e.g., 512 or 1024 or 2048).
             type_vocab_size: The vocabulary size of the `token_type_ids` passed into
                 `T5Model`.
-            initializer_range: The sttdev of the truncated_normal_initializer for
-                initializing all weight matrices.
+            initializer_factor: A factor for initializing all weight matrices (should be kept to 1.0, used for initialization testing).
             layer_norm_eps: The epsilon used by LayerNorm.
     """
     pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -67,25 +66,27 @@ class T5Config(PretrainedConfig):
                  vocab_size_or_config_json_file=32128,
                  n_positions=512,
                  d_model=512,
+                 d_kv=64,
                  d_ff=2048,
-                 num_layers=12,
-                 num_heads=12,
+                 num_layers=6,
+                 num_heads=8,
                  relative_attention_num_buckets=32,
                  dropout_rate=0.1,
                  layer_norm_epsilon=1e-6,
-                 initializer_range=0.02,
+                 initializer_factor=1.0,
                  **kwargs):
         super(T5Config, self).__init__(**kwargs)
         self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
         self.n_positions = n_positions
         self.d_model = d_model
+        self.d_kv = d_kv
         self.d_ff = d_ff
         self.num_layers = num_layers
         self.num_heads = num_heads
         self.relative_attention_num_buckets = relative_attention_num_buckets
         self.dropout_rate = dropout_rate
         self.layer_norm_epsilon = layer_norm_epsilon
-        self.initializer_range = initializer_range
+        self.initializer_factor = initializer_factor
 
         if isinstance(vocab_size_or_config_json_file, six.string_types):
             with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
diff --git a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
index 608027ebac..2b74d2dd93 100755
--- a/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
+++ b/transformers/convert_t5_original_tf_checkpoint_to_pytorch.py
@@ -21,16 +21,16 @@ from __future__ import print_function
 import argparse
 import torch
 
-from transformers import T5Config, T5ForPreTraining, load_tf_weights_in_t5
+from transformers import T5Config, T5Model, load_tf_weights_in_t5
 
 import logging
 logging.basicConfig(level=logging.INFO)
 
-def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, t5_config_file, pytorch_dump_path):
+def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, config_file, pytorch_dump_path):
     # Initialise PyTorch model
-    config = T5Config.from_json_file(t5_config_file)
+    config = T5Config.from_json_file(config_file)
     print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = T5ForPreTraining(config)
+    model = T5Model(config)
 
     # Load weights from tf checkpoint
     load_tf_weights_in_t5(model, config, tf_checkpoint_path)
@@ -48,7 +48,7 @@ if __name__ == "__main__":
                         type = str,
                         required = True,
                         help = "Path to the TensorFlow checkpoint path.")
-    parser.add_argument("--t5_config_file",
+    parser.add_argument("--config_file",
                         default = None,
                         type = str,
                         required = True,
@@ -61,5 +61,5 @@ if __name__ == "__main__":
                         help = "Path to the output PyTorch model.")
     args = parser.parse_args()
     convert_tf_checkpoint_to_pytorch(args.tf_checkpoint_path,
-                                     args.t5_config_file,
+                                     args.config_file,
                                      args.pytorch_dump_path)
diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index ce443cf882..6ed241761a 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -65,34 +65,40 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
     # Load weights from TF model
     init_vars = tf.train.list_variables(tf_path)
     names = []
-    arrays = []
+    tf_weights = {}
     for name, shape in init_vars:
         logger.info("Loading TF weight {} with shape {}".format(name, shape))
         array = tf.train.load_variable(tf_path, name)
         names.append(name)
-        arrays.append(array)
+        tf_weights[name] = array
 
-    for name, array in zip(names, arrays):
-        name = name.split('/')
+    for txt_name in names:
+        name = txt_name.split('/')
         # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
         # which are not required for using pretrained model
         if any(n in ["adam_v", "adam_m", "global_step"] for n in name):
             logger.info("Skipping {}".format("/".join(name)))
+            tf_weights.pop(txt_name, None)
+            continue
+        if '_slot_' in name[-1]:
+            logger.info("Skipping {}".format("/".join(name)))
+            tf_weights.pop(txt_name, None)
             continue
         pointer = model
+        array = tf_weights[txt_name]
         for m_name in name:
             if re.fullmatch(r'[A-Za-z]+_\d+', m_name):
                 l = re.split(r'_(\d+)', m_name)
             else:
                 l = [m_name]
-            if l[0] == 'kernel' or l[0] == 'gamma':
+            if l[0] in ['kernel', 'scale', 'embedding']:
                 pointer = getattr(pointer, 'weight')
-            elif l[0] == 'output_bias' or l[0] == 'beta':
-                pointer = getattr(pointer, 'bias')
-            elif l[0] == 'output_weights':
-                pointer = getattr(pointer, 'weight')
-            elif l[0] == 'squad':
-                pointer = getattr(pointer, 'classifier')
+            # elif l[0] == 'scale':
+            #     pointer = getattr(pointer, 'weight')
+            # elif l[0] == 'output_bias' or l[0] == 'beta':
+            #     pointer = getattr(pointer, 'bias')
+            # elif l[0] == 'squad':
+            #     pointer = getattr(pointer, 'classifier')
             else:
                 try:
                     pointer = getattr(pointer, l[0])
@@ -102,9 +108,10 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
             if len(l) >= 2:
                 num = int(l[1])
                 pointer = pointer[num]
-        if m_name[-11:] == '_embeddings':
+        if l[0] not in ['kernel', 'scale', 'embedding']:
             pointer = getattr(pointer, 'weight')
-        elif m_name == 'kernel':
+        if l[0] != 'embedding':
+            logger.info("Transposing numpy weight of shape {} for {}".format(array.shape, name))
             array = np.transpose(array)
         try:
             assert pointer.shape == array.shape
@@ -112,7 +119,11 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
             e.args += (pointer.shape, array.shape)
             raise
         logger.info("Initialize PyTorch weight {}".format(name))
-        pointer.data = torch.from_numpy(array)
+        pointer.data = torch.from_numpy(array.astype(np.float32))
+        tf_weights.pop(txt_name, None)
+
+    logger.info("Weights not copied to PyTorch model: {}".format(', '.join(tf_weights.keys())))
+    # Entries still in tf_weights were never assigned to a PyTorch parameter (loaded and explicitly skipped names are popped above).
     return model
 
 
@@ -163,10 +174,13 @@ class T5Attention(nn.Module):
         self.output_attentions = config.output_attentions
         self.relative_attention_num_buckets = config.relative_attention_num_buckets
         self.dim = config.d_model
+        self.d_kv = config.d_kv
         self.n_heads = config.num_heads
         self.dropout = config.dropout_rate
         assert self.dim % self.n_heads == 0
+        assert self.dim // self.n_heads == self.d_kv
 
+        # Mesh TensorFlow initialization to avoid scaling before softmax
         self.q = nn.Linear(self.dim, self.dim, bias=False)
         self.k = nn.Linear(self.dim, self.dim, bias=False)
         self.v = nn.Linear(self.dim, self.dim, bias=False)
@@ -312,8 +326,9 @@ class T5Attention(nn.Module):
         scores += position_bias
 
         if mask is not None:
-            mask = (mask == 0).expand_as(scores)                              # (bs, n_heads, qlen, klen)
-            scores.masked_fill_(mask, -float('inf'))                          # (bs, n_heads, qlen, klen)
+            scores += mask
+            # mask = (mask == 0).expand_as(scores)                              # (bs, n_heads, qlen, klen)
+            # scores.masked_fill_(mask, -float('inf'))                          # (bs, n_heads, qlen, klen)
 
         weights = F.softmax(scores.float(), dim=-1).type_as(scores)           # (bs, n_heads, qlen, klen)
         weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
@@ -378,34 +393,35 @@ class T5Block(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
         super(T5Block, self).__init__()
         self.is_decoder = config.is_decoder
-        self.layer_000 = T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias)
+        self.layer = nn.ModuleList()
+        self.layer.append(T5LayerSelfAttention(config, has_relative_attention_bias=has_relative_attention_bias))
         if self.is_decoder:
-            self.layer_001 = T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias)
-            self.layer_002 = T5LayerFF(config)
+            self.layer.append(T5LayerCrossAttention(config, has_relative_attention_bias=has_relative_attention_bias))
+            self.layer.append(T5LayerFF(config))
         else:
-            self.layer_001 = T5LayerFF(config)
+            self.layer.append(T5LayerFF(config))
 
     def forward(self, hidden_states, attention_mask=None, position_bias=None,
                 encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
                 head_mask=None):
-        self_attention_outputs = self.layer_000(hidden_states,
+        self_attention_outputs = self.layer[0](hidden_states,
                                                 attention_mask=attention_mask,
                                                 position_bias=position_bias,
                                                 head_mask=head_mask)
         hidden_states = self_attention_outputs[0]
         outputs = self_attention_outputs[1:]
 
-        if self.is_decoder:
-            cross_attention_outputs = self.layer_001(hidden_states,
-                                                     kv=encoder_hidden_states,
-                                                     attention_mask=encoder_attention_mask,
-                                                     position_bias=encoder_decoder_position_bias,
-                                                     head_mask=head_mask)
+        if not self.is_decoder:
+            hidden_states = self.layer[1](hidden_states)
+        else:
+            cross_attention_outputs = self.layer[1](hidden_states,
+                                                    kv=encoder_hidden_states,
+                                                    attention_mask=encoder_attention_mask,
+                                                    position_bias=encoder_decoder_position_bias,
+                                                    head_mask=head_mask)
             hidden_states = cross_attention_outputs[0]
             outputs = cross_attention_outputs[1:] + outputs
-            hidden_states = self.layer_002(hidden_states)
-        else:
-            hidden_states = self.layer_001(hidden_states)
+            hidden_states = self.layer[2](hidden_states)
 
         outputs = (hidden_states,) + outputs  # add attentions if we output them
         return outputs
@@ -422,15 +438,36 @@ class T5PreTrainedModel(PreTrainedModel):
 
     def _init_weights(self, module):
         """ Initialize the weights """
-        if isinstance(module, (nn.Linear, nn.Embedding)):
-            # Slightly different from the TF version which uses truncated_normal for initialization
-            # cf https://github.com/pytorch/pytorch/pull/5617
-            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
-        elif isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
-            module.weight.data.fill_(1.0)
-        if isinstance(module, nn.Linear) and module.bias is not None:
+        factor = self.config.initializer_factor  # Used for testing weights initialization
+        if isinstance(module, nn.LayerNorm):
             module.bias.data.zero_()
+            module.weight.data.fill_(factor*1.0)
+        elif isinstance(module, T5Model):
+            # Mesh TensorFlow embeddings initialization
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
+            module.shared.weight.data.normal_(mean=0.0, std=factor*1.0)
+        elif isinstance(module, T5DenseReluDense):
+            # Mesh TensorFlow FF initialization
+            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
+            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
+            module.wi.weight.data.normal_(mean=0.0, std=factor*((self.config.d_model) ** -0.5))
+            if hasattr(module.wi, 'bias') and module.wi.bias is not None:
+                module.wi.bias.data.zero_()
+            module.wo.weight.data.normal_(mean=0.0, std=factor*((self.config.d_ff) ** -0.5))
+            if hasattr(module.wo, 'bias') and module.wo.bias is not None:
+                module.wo.bias.data.zero_()
+        elif isinstance(module, T5Attention):
+            # Mesh TensorFlow attention initialization to avoid scaling before softmax
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
+            d_model = self.config.d_model
+            d_kv = self.config.d_kv
+            n_heads = self.config.num_heads
+            module.q.weight.data.normal_(mean=0.0, std=factor*((d_model * d_kv) ** -0.5))
+            module.k.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
+            module.v.weight.data.normal_(mean=0.0, std=factor*(d_model ** -0.5))
+            module.o.weight.data.normal_(mean=0.0, std=factor*((n_heads * d_kv) ** -0.5))
+            if module.has_relative_attention_bias:
+                module.relative_attention_bias.weight.data.normal_(mean=0.0, std=factor*((d_model) ** -0.5))
 
 
 class T5Stack(T5PreTrainedModel):
@@ -440,8 +477,8 @@ class T5Stack(T5PreTrainedModel):
         self.output_hidden_states = config.output_hidden_states
         self.is_decoder = config.is_decoder
 
-        self.blocks = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0))
-                                     for i in range(config.num_layers)])
+        self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0))
+                                    for i in range(config.num_layers)])
         self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
 
@@ -518,7 +555,7 @@ class T5Stack(T5PreTrainedModel):
         all_attentions = ()
         position_bias = None
         encoder_decoder_position_bias = None
-        for i, layer_module in enumerate(self.blocks):
+        for i, layer_module in enumerate(self.block):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
@@ -724,9 +761,10 @@ class T5WithLMHeadModel(T5PreTrainedModel):
     """
     def __init__(self, config):
         super(T5WithLMHeadModel, self).__init__(config)
+        self.model_dim = config.d_model
 
         self.transformer = T5Model(config)
-        self.lm_head = nn.Linear(config.d_model, config.vocab_size)
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
 
         self.init_weights()
 
@@ -738,15 +776,18 @@ class T5WithLMHeadModel(T5PreTrainedModel):
         outputs = self.transformer(**kwargs)
 
         sequence_output = outputs[0]
+        # Rescale output before projecting on vocab
+        # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
+        sequence_output = sequence_output * (self.model_dim ** -0.5)
         lm_logits = self.lm_head(sequence_output)
 
-        outputs = (lm_logits,) + outputs[2:]  # Add hidden states and attention if they are here
+        outputs = (lm_logits,) + outputs[1:]  # Add hidden states and attention if they are here
         if lm_labels is not None:
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
-            outputs = (loss,) + outputs
+            outputs = (loss,) + outputs  # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
 
         return outputs  # (lm_loss), lm_logits, (hidden_states), (attentions)
diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index 42bf9ac3f5..ee75da605c 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -59,7 +59,7 @@ else:
 def _config_zero_init(config):
     configs_no_init = copy.deepcopy(config)
     for key in configs_no_init.__dict__.keys():
-        if '_range' in key or '_std' in key:
+        if '_range' in key or '_std' in key or 'initializer_factor' in key:
             setattr(configs_no_init, key, 0.0)
     return configs_no_init
 
@@ -83,20 +83,24 @@ class CommonTestCases:
                 model.eval()
                 with torch.no_grad():
                     outputs = model(**inputs_dict)
+                out_2 = outputs[0].numpy()
+                out_2[np.isnan(out_2)] = 0
 
                 with TemporaryDirectory() as tmpdirname:
                     model.save_pretrained(tmpdirname)
                     model = model_class.from_pretrained(tmpdirname)
-                    with torch.no_grad():
-                        after_outputs = model(**inputs_dict)
 
-                    # Make sure we don't have nans
-                    out_1 = after_outputs[0].numpy()
-                    out_2 = outputs[0].numpy()
-                    out_1 = out_1[~np.isnan(out_1)]
-                    out_2 = out_2[~np.isnan(out_2)]
-                    max_diff = np.amax(np.abs(out_1 - out_2))
-                    self.assertLessEqual(max_diff, 1e-5)
+                with torch.no_grad():
+                    after_outputs = model(**inputs_dict)
+
+                # Replace NaNs with 0 so they do not affect the comparison below
+                out_1 = after_outputs[0].numpy()
+                out_1[np.isnan(out_1)] = 0
+
+                out_1 = out_1 - out_2
+                amax = np.amax(out_1)
+                amin = np.amin(out_1)
+                self.assertLessEqual(max(amax, -amin), 1e-5)
 
         def test_initialization(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -127,27 +131,28 @@ class CommonTestCases:
                 model = model_class(config)
                 model.eval()
                 outputs = model(**inputs_dict)
-                self_attentions = outputs[-1]
+                attentions = outputs[-1]
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, False)
-                self.assertEqual(len(self_attentions), self.model_tester.num_hidden_layers)
+                self.assertEqual(len(attentions), self.model_tester.num_hidden_layers)
                 self.assertListEqual(
-                    list(self_attentions[0].shape[-3:]),
+                    list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
                     self.model_tester.seq_length,
                     self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
                 out_len = len(outputs)
 
                 if self.is_encoder_decoder:
-                    cross_attentions = outputs[-2]
+                    self.assertEqual(out_len % 2, 0)
+                    decoder_attentions = outputs[(out_len // 2)-1]
                     self.assertEqual(model.config.output_attentions, True)
                     self.assertEqual(model.config.output_hidden_states, False)
-                    self.assertEqual(len(cross_attentions), self.model_tester.num_hidden_layers)
+                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
                     self.assertListEqual(
-                        list(cross_attentions[0].shape[-3:]),
+                        list(decoder_attentions[0].shape[-3:]),
                         [self.model_tester.num_attention_heads,
-                        self.model_tester.seq_length,
-                        self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                         self.model_tester.seq_length,
+                         self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
 
                 # Check attention is always last and order is fine
                 config.output_attentions = True
diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py
index b8bb828ebd..2c67b83c25 100644
--- a/transformers/tests/modeling_t5_test.py
+++ b/transformers/tests/modeling_t5_test.py
@@ -57,7 +57,7 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
                      d_ff=37,
                      relative_attention_num_buckets=8,
                      dropout_rate=0.1,
-                     initializer_range=0.02,
+                     initializer_factor=0.002,
                      scope=None,
                     ):
             self.parent = parent
@@ -74,7 +74,7 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
             self.d_ff = d_ff
             self.relative_attention_num_buckets = relative_attention_num_buckets
             self.dropout_rate = dropout_rate
-            self.initializer_range = initializer_range
+            self.initializer_factor = initializer_factor
             self.scope = scope
 
         def prepare_config_and_inputs(self):
@@ -93,11 +93,12 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
                 n_positions=self.n_positions,
                 d_model=self.hidden_size,
                 d_ff=self.d_ff,
+                d_kv=self.hidden_size // self.num_attention_heads,
                 num_layers=self.num_hidden_layers,
                 num_heads=self.num_attention_heads,
                 relative_attention_num_buckets=self.relative_attention_num_buckets,
                 dropout_rate=self.dropout_rate,
-                initializer_range=self.initializer_range)
+                initializer_factor=self.initializer_factor)
 
             return (config, input_ids, input_mask, token_labels)
 
@@ -130,8 +131,9 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
         def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
             model = T5WithLMHeadModel(config=config)
             model.eval()
-            loss, prediction_scores = model(encoder_input_ids=input_ids, decoder_input_ids=input_ids,
-                                            decoder_attention_mask=input_mask, decoder_lm_labels=token_labels)
+            outputs = model(encoder_input_ids=input_ids, decoder_input_ids=input_ids,
+                            decoder_attention_mask=input_mask, decoder_lm_labels=token_labels)
+            loss, prediction_scores = outputs[0], outputs[1]
             result = {
                 "loss": loss,
                 "prediction_scores": prediction_scores,
diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py
index cff6a41baf..ae898ba0d3 100644
--- a/transformers/tokenization_t5.py
+++ b/transformers/tokenization_t5.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 import os
+from shutil import copyfile
 
 from .tokenization_utils import PreTrainedTokenizer
 

From 8fda532c3cbab9e31fbbfa860f232b69e0f80633 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 7 Nov 2019 17:09:50 +0100
Subject: [PATCH 008/110] fix python 2 sentencepiece tokenization

---
 transformers/tests/tokenization_t5_test.py |  7 +++---
 transformers/tokenization_t5.py            | 26 ++++++++++++++++++----
 2 files changed, 26 insertions(+), 7 deletions(-)

diff --git a/transformers/tests/tokenization_t5_test.py b/transformers/tests/tokenization_t5_test.py
index 9362487d8d..aabb21e443 100644
--- a/transformers/tests/tokenization_t5_test.py
+++ b/transformers/tests/tokenization_t5_test.py
@@ -18,7 +18,8 @@ import os
 import unittest
 import pytest
 
-from transformers.tokenization_t5 import (T5Tokenizer, SPIECE_UNDERLINE)
+from transformers.tokenization_t5 import (T5Tokenizer)
+from transformers.tokenization_xlnet import SPIECE_UNDERLINE
 
 from .tokenization_tests_commons import CommonTestCases
 
@@ -33,7 +34,7 @@ class T5TokenizationTest(CommonTestCases.CommonTokenizerTester):
         super(T5TokenizationTest, self).setUp()
 
         # We have a SentencePiece fixture for testing
-        tokenizer = T5Tokenizer(SAMPLE_VOCAB, keep_accents=True)
+        tokenizer = T5Tokenizer(SAMPLE_VOCAB)
         tokenizer.save_pretrained(self.tmpdirname)
 
     def get_tokenizer(self, **kwargs):
@@ -45,7 +46,7 @@ class T5TokenizationTest(CommonTestCases.CommonTokenizerTester):
         return input_text, output_text
 
     def test_full_tokenizer(self):
-        tokenizer = T5Tokenizer(SAMPLE_VOCAB, keep_accents=True)
+        tokenizer = T5Tokenizer(SAMPLE_VOCAB)
 
         tokens = tokenizer.tokenize(u'This is a test')
         self.assertListEqual(tokens, [u'▁This', u'▁is', u'▁a', u'▁t', u'est'])
diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py
index ae898ba0d3..93842d29f0 100644
--- a/transformers/tokenization_t5.py
+++ b/transformers/tokenization_t5.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 import os
+import six
 from shutil import copyfile
 
 from .tokenization_utils import PreTrainedTokenizer
@@ -96,18 +97,35 @@ class T5Tokenizer(PreTrainedTokenizer):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(self.vocab_file)
 
-    def _tokenize(self, text):
+    def _tokenize(self, text, return_unicode=True, sample=False):
         """ Take as input a string and return a list of strings (tokens) for words/sub-words
         """
-        return self.sp_model.EncodeAsPieces(text)
+        if not sample:
+            pieces = self.sp_model.EncodeAsPieces(text)
+        else:
+            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
+
+        # convert back to unicode for py2
+        if six.PY2 and return_unicode:
+            ret_pieces = []
+            for piece in pieces:
+                if isinstance(piece, str):
+                    piece = piece.decode('utf-8')
+                ret_pieces.append(piece)
+            pieces = ret_pieces
+
+        return pieces
 
     def _convert_token_to_id(self, token):
         """ Converts a token (str/unicode) in an id using the vocab. """
         return self.sp_model.piece_to_id(token)
 
-    def _convert_id_to_token(self, index):
+    def _convert_id_to_token(self, index, return_unicode=True):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        return self.sp_model.id_to_piece(index)
+        token = self.sp_model.IdToPiece(index)
+        if six.PY2 and return_unicode and isinstance(token, str):
+            token = token.decode('utf-8')
+        return token
 
     def convert_tokens_to_string(self, tokens):
         """ Converts a sequence of tokens (string) in a single string. """

From 727a79b305364522b6853679c5523efd9de7f772 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Nov 2019 11:35:03 +0100
Subject: [PATCH 009/110] added TF2 model and tests - updated templates

---
 .../adding_a_new_model/modeling_tf_xxx.py     |   2 +
 templates/adding_a_new_model/modeling_xxx.py  |   2 +
 transformers/__init__.py                      |   3 +
 transformers/configuration_auto.py            |   6 +-
 transformers/configuration_t5.py              |   3 +-
 transformers/modeling_t5.py                   |  79 +-
 transformers/modeling_tf_pytorch_utils.py     |   4 +-
 transformers/modeling_tf_t5.py                | 783 +++++++++++-------
 transformers/modeling_utils.py                |   6 +-
 transformers/tests/modeling_tf_common_test.py |  23 +-
 transformers/tests/modeling_tf_t5_test.py     | 116 ++-
 11 files changed, 646 insertions(+), 381 deletions(-)

diff --git a/templates/adding_a_new_model/modeling_tf_xxx.py b/templates/adding_a_new_model/modeling_tf_xxx.py
index c661975768..b58817e453 100644
--- a/templates/adding_a_new_model/modeling_tf_xxx.py
+++ b/templates/adding_a_new_model/modeling_tf_xxx.py
@@ -26,6 +26,8 @@ import logging
 import math
 import os
 import sys
+import copy
+import itertools
 from io import open
 
 import numpy as np
diff --git a/templates/adding_a_new_model/modeling_xxx.py b/templates/adding_a_new_model/modeling_xxx.py
index ee705e753c..9c3505f0cf 100644
--- a/templates/adding_a_new_model/modeling_xxx.py
+++ b/templates/adding_a_new_model/modeling_xxx.py
@@ -25,6 +25,8 @@ import logging
 import math
 import os
 import sys
+import copy
+import itertools
 from io import open
 
 import torch
diff --git a/transformers/__init__.py b/transformers/__init__.py
index 601a068592..b882f4d968 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -158,6 +158,9 @@ if is_tf_available():
                                     TFCTRLLMHeadModel,
                                     TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
 
+    from .modeling_tf_t5 import (TFT5PreTrainedModel, TFT5Model, TFT5WithLMHeadModel,
+                                 TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
+
 # TF 2.0 <=> PyTorch conversion utilities
 from .modeling_tf_pytorch_utils import (convert_tf_weight_name_to_pt_weight_name,
                                         load_pytorch_checkpoint_in_tf2_model,
diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py
index edd21a670c..3bee5b84a1 100644
--- a/transformers/configuration_auto.py
+++ b/transformers/configuration_auto.py
@@ -27,6 +27,7 @@ from .configuration_xlm import XLMConfig
 from .configuration_roberta import RobertaConfig
 from .configuration_distilbert import DistilBertConfig
 from .configuration_ctrl import CTRLConfig
+from .configuration_t5 import T5Config
 
 logger = logging.getLogger(__name__)
 
@@ -64,6 +65,7 @@ class AutoConfig(object):
 
         The configuration class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Config (T5 model)
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
             - contains `bert`: BertConfig (Bert model)
             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
@@ -114,7 +116,9 @@ class AutoConfig(object):
             assert unused_kwargs == {'foo': False}
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py
index 96e67758ac..83aab66fac 100644
--- a/transformers/configuration_t5.py
+++ b/transformers/configuration_t5.py
@@ -27,8 +27,7 @@ from .configuration_utils import PretrainedConfig
 logger = logging.getLogger(__name__)
 
 T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-config.json",
-    't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-config.json",
+    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
 }
 
 
diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index 6ed241761a..6be0ae6863 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -41,8 +41,7 @@ logger = logging.getLogger(__name__)
 # for the pretrained weights provided with the models
 ####################################################
 T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-pytorch_model.bin",
-    't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-pytorch_model.bin",
+    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
 }
 
 ####################################################
@@ -442,7 +441,7 @@ class T5PreTrainedModel(PreTrainedModel):
         if isinstance(module, nn.LayerNorm):
             module.bias.data.zero_()
             module.weight.data.fill_(factor*1.0)
-        elif isinstance(module, T5Model):
+        elif isinstance(module, (T5Model, T5WithLMHeadModel)):
             # Mesh TensorFlow embeddings initialization
             # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
             module.shared.weight.data.normal_(mean=0.0, std=factor*1.0)
@@ -502,11 +501,10 @@ class T5Stack(T5PreTrainedModel):
         # ourselves in which case we just need to make it broadcastable to all heads.
         if attention_mask.dim() == 3:
             extended_attention_mask = attention_mask[:, None, :, :]
-
+        elif attention_mask.dim() == 2:
         # Provided a padding mask of dimensions [batch_size, seq_length]
         # - if the model is a decoder, apply a causal mask in addition to the padding mask
         # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
-        if attention_mask.dim() == 2:
             if self.config.is_decoder:
                 seq_ids = torch.arange(seq_length, device=hidden_states.device)
                 causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
@@ -593,7 +591,7 @@ class T5Stack(T5PreTrainedModel):
 T5_START_DOCSTRING = r"""    The T5 model was proposed in
     `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
     by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
-    It's an encoder decoder pre-trained transformer.
+    It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting.
 
     This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
     refer to the PyTorch documentation for all matter related to general usage and behavior.
@@ -634,16 +632,13 @@ T5_INPUTS_DOCSTRING = r"""
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
 
-@add_start_docstrings("The bare single stack (encoder or decoder) of a T5 Model transformer outputting raw hidden-states"
+@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states "
                       "without any specific head on top.",
                       T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
 class T5Model(T5PreTrainedModel):
@@ -661,8 +656,8 @@ class T5Model(T5PreTrainedModel):
 
     Examples::
 
-        tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased')
-        model = T5Model.from_pretrained('t5-base-uncased')
+        tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        model = T5Model.from_pretrained('t5-small')
         input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
@@ -752,8 +747,8 @@ class T5WithLMHeadModel(T5PreTrainedModel):
 
     Examples::
 
-        tokenizer = T5Tokenizer.from_pretrained('t5-base-uncased')
-        model = T5WithLMHeadModel.from_pretrained('t5-base-uncased')
+        tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        model = T5WithLMHeadModel.from_pretrained('t5-small')
         input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
         outputs = model(input_ids, lm_labels=input_ids)
         loss, prediction_scores = outputs[:2]
@@ -763,31 +758,73 @@ class T5WithLMHeadModel(T5PreTrainedModel):
         super(T5WithLMHeadModel, self).__init__(config)
         self.model_dim = config.d_model
 
-        self.transformer = T5Model(config)
+        self.shared = nn.Embedding(config.vocab_size, config.d_model)
+
+        encoder_config = copy.deepcopy(config)
+        self.encoder = T5Stack(encoder_config)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        self.decoder = T5Stack(decoder_config)
+
         self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
 
         self.init_weights()
 
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+
     def get_output_embeddings(self):
         return self.lm_head
 
     def forward(self, **kwargs):
-        lm_labels = kwargs.pop('decoder_lm_labels', None)
-        outputs = self.transformer(**kwargs)
+        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
+        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
+        # that apply to the model as a whole.
+        # We let the specific kwargs override the common ones in case of conflict.
 
-        sequence_output = outputs[0]
+        lm_labels = kwargs.pop('decoder_lm_labels', None)
+
+        kwargs_common = dict((k, v) for k, v in kwargs.items()
+                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        if encoder_hidden_states is None:
+            encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+            hidden_states = self.shared(encoder_inputs_ids)  # Convert input ids into embeddings
+            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
+            encoder_hidden_states = encoder_outputs[0]
+        else:
+            encoder_outputs = ()
+
+        # Decode
+        decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+        hidden_states = self.shared(decoder_inputs_ids)  # Convert input ids into embeddings
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
+        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
+
+        sequence_output = decoder_outputs[0]
         # Rescale output before projecting on vocab
         # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/transformer.py#L586
         sequence_output = sequence_output * (self.model_dim ** -0.5)
         lm_logits = self.lm_head(sequence_output)
 
-        outputs = (lm_logits,) + outputs[1:]  # Add hidden states and attention if they are here
+        decoder_outputs = (lm_logits,) + decoder_outputs[1:]  # Add hidden states and attention if they are here
         if lm_labels is not None:
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                             shift_labels.view(-1))
-            outputs = (loss,) + outputs  # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
+            decoder_outputs = (loss,) + decoder_outputs  # TODO(thom): Add z_loss https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L666
 
-        return outputs  # (lm_loss), lm_logits, (hidden_states), (attentions)
+        return decoder_outputs + encoder_outputs
diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py
index 88ce4d4610..6330c2748c 100644
--- a/transformers/modeling_tf_pytorch_utils.py
+++ b/transformers/modeling_tf_pytorch_utils.py
@@ -156,7 +156,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
             e.args += (symbolic_weight.shape, array.shape)
             raise e
 
-        logger.info("Initialize TF weight {}".format(symbolic_weight.name))
+        # logger.warning("Initialize TF weight {}".format(symbolic_weight.name))
 
         weight_value_tuples.append((symbolic_weight, array))
         all_pytorch_weights.discard(name)
@@ -269,7 +269,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
             e.args += (pt_weight.shape, array.shape)
             raise e
 
-        logger.info("Initialize PyTorch weight {}".format(pt_weight_name))
+        # logger.warning("Initialize PyTorch weight {}".format(pt_weight_name))
 
         new_pt_params_dict[pt_weight_name] = torch.from_numpy(array)
         loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array)
diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py
index deb453846c..c1de4745c2 100644
--- a/transformers/modeling_tf_t5.py
+++ b/transformers/modeling_tf_t5.py
@@ -22,24 +22,21 @@ import logging
 import math
 import os
 import sys
+import copy
+import itertools
 from io import open
 
 import numpy as np
 import tensorflow as tf
 
 from .configuration_t5 import T5Config
-from .modeling_tf_utils import TFPreTrainedModel, get_initializer
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer, DUMMY_INPUTS
 from .file_utils import add_start_docstrings
 
 logger = logging.getLogger(__name__)
 
-####################################################
-# This dict contrains shortcut names and associated url
-# for the pretrained weights provided with the models
-####################################################
 TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    't5-base-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-uncased-tf_model.h5",
-    't5-large-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-uncased-tf_model.h5",
+    't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
 }
 
 ####################################################
@@ -48,33 +45,294 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
 # - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model)
 ####################################################
 
-####################################################
-# Here is an example of typical layer in a TF 2.0 model of the library
-# The classes are usually identical to the PyTorch ones and prefixed with 'TF'.
-#
-# Note that class __init__ parameters includes **kwargs (send to 'super').
-# This let us have a control on class scope and variable names:
-# More precisely, we set the names of the class attributes (lower level layers) to
-# to the equivalent attributes names in the PyTorch model so we can have equivalent
-# class and scope structure between PyTorch and TF 2.0 models and easily load one in the other.
-#
-# See the conversion methods in modeling_tf_pytorch_utils.py for more details
-####################################################
-class TFT5Layer(tf.keras.layers.Layer):
+class TFT5DenseReluDense(tf.keras.layers.Layer):  # bias-free feed-forward: wi -> relu -> dropout -> wo
     def __init__(self, config, **kwargs):
-        super(TFT5Layer, self).__init__(**kwargs)
-        self.attention = TFT5Attention(config, name='attention')
-        self.intermediate = TFT5Intermediate(config, name='intermediate')
-        self.transformer_output = TFT5Output(config, name='output')
+        super(TFT5DenseReluDense, self).__init__(**kwargs)
+        self.wi = tf.keras.layers.Dense(config.d_ff, use_bias=False, name='wi')  # inner projection, d_ff units
+        self.wo = tf.keras.layers.Dense(config.d_model, use_bias=False, name='wo')  # output projection, d_model units
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+        self.act = tf.keras.activations.relu
 
-    def call(self, inputs, training=False):
-        hidden_states, attention_mask, head_mask = inputs
+    def call(self, hidden_states, training=False):
+        h = self.wi(hidden_states)
+        h = self.act(h)
+        h = self.dropout(h, training=training)  # dropout sits between the activation and the output projection
+        h = self.wo(h)
+        return h
 
-        attention_outputs = self.attention([hidden_states, attention_mask, head_mask], training=training)
-        attention_output = attention_outputs[0]
-        intermediate_output = self.intermediate(attention_output)
-        layer_output = self.transformer_output([intermediate_output, attention_output], training=training)
-        outputs = (layer_output,) + attention_outputs[1:]  # add attentions if we output them
+
+class TFT5LayerFF(tf.keras.layers.Layer):
+    """ Feed-forward sub-layer: layer norm, DenseReluDense, dropout, then residual add. """
+    def __init__(self, config, **kwargs):
+        super(TFT5LayerFF, self).__init__(**kwargs)
+        self.DenseReluDense = TFT5DenseReluDense(config, name='DenseReluDense')
+        self.layer_norm = tf.keras.layers.LayerNormalization(name='layer_norm', epsilon=config.layer_norm_epsilon)
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+    def call(self, hidden_states, training=False):
+        # The branch is computed on the normalized input; the skip path keeps the raw input.
+        ff_out = self.DenseReluDense(self.layer_norm(hidden_states), training=training)
+        ff_out = self.dropout(ff_out, training=training)
+        return hidden_states + ff_out
+
+
+class TFT5Attention(tf.keras.layers.Layer):  # multi-head self- or cross-attention with optional relative position bias
+    NEW_ID = itertools.count()  # class-wide counter: gives each instance a unique layer_id, used as its cache key
+
+    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
+        super(TFT5Attention, self).__init__(**kwargs)
+        self.layer_id = next(TFT5Attention.NEW_ID)
+        self.is_decoder = config.is_decoder
+        self.has_relative_attention_bias = has_relative_attention_bias
+
+        self.output_attentions = config.output_attentions
+        self.relative_attention_num_buckets = config.relative_attention_num_buckets
+        self.dim = config.d_model
+        self.d_kv = config.d_kv
+        self.n_heads = config.num_heads
+        assert self.dim % self.n_heads == 0
+        assert self.dim // self.n_heads == self.d_kv
+
+        # Mesh TensorFlow initialization to avoid scaling before softmax (NOTE(review): no custom initializer is passed here)
+        self.q = tf.keras.layers.Dense(self.dim, use_bias=False, name='q')
+        self.k = tf.keras.layers.Dense(self.dim, use_bias=False, name='k')
+        self.v = tf.keras.layers.Dense(self.dim, use_bias=False, name='v')
+        self.o = tf.keras.layers.Dense(self.dim, use_bias=False, name='o')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+        if self.has_relative_attention_bias:
+            self.relative_attention_bias = tf.keras.layers.Embedding(self.relative_attention_num_buckets,
+                                                                     self.n_heads,
+                                                                     name='relative_attention_bias')
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        raise NotImplementedError
+
+    @staticmethod
+    def _relative_position_bucket(relative_position,
+                                  bidirectional=True,
+                                  num_buckets=32,
+                                  max_distance=128):
+        """
+        Adapted from Mesh Tensorflow:
+        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593
+
+        Translate relative position to a bucket number for relative attention.
+        The relative position is defined as memory_position - query_position, i.e.
+        the distance in tokens from the attending position to the attended-to
+        position.  If bidirectional=False, then positive relative positions are
+        invalid.
+        We use smaller buckets for small absolute relative_position and larger buckets
+        for larger absolute relative_positions.  All relative positions >=max_distance
+        map to the same bucket.  All relative positions <=-max_distance map to the
+        same bucket.  This should allow for more graceful generalization to longer
+        sequences than the model has been trained on.
+        Args:
+            relative_position: an int32 Tensor
+            bidirectional: a boolean - whether the attention is bidirectional
+            num_buckets: an integer
+            max_distance: an integer
+        Returns:
+            a Tensor with the same shape as relative_position, containing int32
+            values in the range [0, num_buckets)
+        """
+        ret = 0
+        n = -relative_position
+        if bidirectional:
+            num_buckets //= 2  # half of the buckets for each sign of the relative position
+            ret += tf.dtypes.cast(tf.math.less(n, 0), tf.int32) * num_buckets
+            n = tf.math.abs(n)
+        else:
+            n = tf.math.maximum(n, 0)
+        # now n is in the range [0, inf)
+        max_exact = num_buckets // 2  # distances below max_exact each get their own bucket
+        is_small = tf.math.less(n, max_exact)
+        val_if_large = max_exact + tf.dtypes.cast(
+            tf.math.log(tf.dtypes.cast(n, tf.float32) / max_exact)
+            / math.log(max_distance / max_exact) * (num_buckets - max_exact), tf.int32)
+        val_if_large = tf.math.minimum(val_if_large, num_buckets - 1)
+        ret += tf.where(is_small, n, val_if_large)
+        return ret
+
+    def compute_bias(self, qlen, klen):
+        """ Compute binned relative position bias """
+        context_position = tf.range(qlen)[:, None]
+        memory_position = tf.range(klen)[None, :]
+        relative_position = memory_position - context_position  # shape (qlen, klen)
+        rp_bucket = self._relative_position_bucket(relative_position,
+                                                   bidirectional=not self.is_decoder,
+                                                   num_buckets=self.relative_attention_num_buckets)
+        values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
+        values = tf.expand_dims(tf.transpose(values, [2, 0, 1]), axis=0) # shape (1, num_heads, qlen, klen)
+        return values
+
+    def call(self, input, mask=None, kv=None, position_bias=None, cache=None, head_mask=None, training=False):
+        """
+        Self-attention (if kv is None) or attention over source sentence (provided by kv).
+        """
+        # Input is (bs, qlen, dim)
+        # Mask is additive (added to the scores below) and must broadcast against (bs, n_heads, qlen, klen)
+        bs, qlen, dim = shape_list(input)
+        if kv is None:
+            klen = qlen if cache is None else cache['slen'] + qlen
+        else:
+            klen = shape_list(kv)[1]
+        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
+        n_heads = self.n_heads
+        dim_per_head = self.dim // n_heads
+
+        def shape(x):
+            """  projection """
+            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
+
+        def unshape(x):
+            """  compute context """
+            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
+
+        q = shape(self.q(input))                                          # (bs, n_heads, qlen, dim_per_head)
+        if kv is None:
+            k = shape(self.k(input))                                      # (bs, n_heads, qlen, dim_per_head)
+            v = shape(self.v(input))                                      # (bs, n_heads, qlen, dim_per_head)
+        elif cache is None or self.layer_id not in cache:
+            k = v = kv
+            k = shape(self.k(k))                                          # (bs, n_heads, klen, dim_per_head)
+            v = shape(self.v(v))                                          # (bs, n_heads, klen, dim_per_head)
+
+        if cache is not None:
+            if self.layer_id in cache:
+                if kv is None:
+                    k_, v_ = cache[self.layer_id]
+                    k = tf.concat([k_, k], axis=2)                             # (bs, n_heads, klen, dim_per_head)
+                    v = tf.concat([v_, v], axis=2)                             # (bs, n_heads, klen, dim_per_head)
+                else:
+                    k, v = cache[self.layer_id]
+            cache[self.layer_id] = (k, v)  # the caller's cache dict is updated in place
+
+        # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
+        scores = tf.matmul(q, k, transpose_b=True)                            # (bs, n_heads, qlen, klen)
+
+        if position_bias is None:
+            if not self.has_relative_attention_bias:
+                raise ValueError("No position_bias provided and no weights to compute position_bias")
+            position_bias = self.compute_bias(qlen, klen)
+        scores += position_bias
+
+        if mask is not None:
+            scores += mask
+            # mask = (mask == 0).expand_as(scores)                              # (bs, n_heads, qlen, klen)
+            # scores.masked_fill_(mask, -float('inf'))                          # (bs, n_heads, qlen, klen)
+
+        weights = tf.nn.softmax(scores, axis=-1)                              # (bs, n_heads, qlen, klen)
+        weights = self.dropout(weights, training=training)                    # (bs, n_heads, qlen, klen)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            weights = weights * head_mask
+
+        context = tf.matmul(weights, v)                                    # (bs, n_heads, qlen, dim_per_head)
+        context = unshape(context)                                            # (bs, qlen, dim)
+
+        context = self.o(context)
+
+        outputs = (context,)
+        if self.output_attentions:
+            outputs = outputs + (weights,)
+        if self.has_relative_attention_bias:  # only a layer owning bias weights returns the bias it used
+            outputs = outputs + (position_bias,)
+        return outputs
+
+
+class TFT5LayerSelfAttention(tf.keras.layers.Layer):
+    """ Self-attention sub-layer: layer norm, attention, dropout, then residual add. """
+    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
+        super(TFT5LayerSelfAttention, self).__init__(**kwargs)
+        self.SelfAttention = TFT5Attention(config,
+                                           name='SelfAttention',
+                                           has_relative_attention_bias=has_relative_attention_bias)
+        self.layer_norm = tf.keras.layers.LayerNormalization(name='layer_norm',
+                                                             epsilon=config.layer_norm_epsilon)
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+    def call(self, hidden_states, attention_mask=None, position_bias=None,
+             head_mask=None, training=False):
+        attn_kwargs = dict(mask=attention_mask, position_bias=position_bias,
+                           head_mask=head_mask, training=training)
+        # Attention reads the normalized input; the residual path keeps the raw one.
+        attn_outputs = self.SelfAttention(self.layer_norm(hidden_states), **attn_kwargs)
+        attended, extras = attn_outputs[0], attn_outputs[1:]
+        residual = hidden_states + self.dropout(attended, training=training)
+        # extras carries whatever the attention returned after its context output
+        outputs = (residual,) + extras
+        return outputs
+
+
+class TFT5LayerCrossAttention(tf.keras.layers.Layer):
+    """ Cross-attention sub-layer: layer norm, attention over `kv`, dropout, then residual add. """
+    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
+        super(TFT5LayerCrossAttention, self).__init__(**kwargs)
+        self.EncDecAttention = TFT5Attention(config,
+                                             name='EncDecAttention',
+                                             has_relative_attention_bias=has_relative_attention_bias)
+        self.layer_norm = tf.keras.layers.LayerNormalization(name='layer_norm',
+                                                             epsilon=config.layer_norm_epsilon)
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
+
+    def call(self, hidden_states, kv, attention_mask=None, position_bias=None,
+             head_mask=None, training=False):
+        attn_kwargs = dict(mask=attention_mask, kv=kv,
+                           position_bias=position_bias,
+                           head_mask=head_mask, training=training)
+        # Queries come from the normalized input; keys/values are projected from `kv`.
+        attn_outputs = self.EncDecAttention(self.layer_norm(hidden_states), **attn_kwargs)
+        attended, extras = attn_outputs[0], attn_outputs[1:]
+        residual = hidden_states + self.dropout(attended, training=training)
+        # extras carries whatever the attention returned after its context output
+        outputs = (residual,) + extras
+        return outputs
+
+
+class TFT5Block(tf.keras.layers.Layer):
+    """ One T5 block: self-attention, cross-attention (decoder only), then feed-forward. """
+    def __init__(self, config, has_relative_attention_bias=False, **kwargs):
+        super(TFT5Block, self).__init__(**kwargs)
+        self.is_decoder = config.is_decoder
+        self.layer = []
+        self.layer.append(TFT5LayerSelfAttention(config,
+                                                 has_relative_attention_bias=has_relative_attention_bias,
+                                                 name='layer_._0'))
+        if self.is_decoder:
+            self.layer.append(TFT5LayerCrossAttention(config,
+                                                      has_relative_attention_bias=has_relative_attention_bias,
+                                                      name='layer_._1'))
+            self.layer.append(TFT5LayerFF(config, name='layer_._2'))
+        else:
+            self.layer.append(TFT5LayerFF(config, name='layer_._1'))
+
+    def call(self, hidden_states, attention_mask=None, position_bias=None,
+             encoder_hidden_states=None, encoder_attention_mask=None, encoder_decoder_position_bias=None,
+             head_mask=None, training=False):
+        self_attention_outputs = self.layer[0](hidden_states,
+                                                attention_mask=attention_mask,
+                                                position_bias=position_bias,
+                                                head_mask=head_mask,
+                                                training=training)
+        hidden_states = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # ([self-attn weights], [self-attn position bias])
+        if not self.is_decoder:
+            hidden_states = self.layer[1](hidden_states, training=training)
+        else:
+            cross_attention_outputs = self.layer[1](hidden_states,
+                                                    kv=encoder_hidden_states,
+                                                    attention_mask=encoder_attention_mask,
+                                                    position_bias=encoder_decoder_position_bias,
+                                                    head_mask=head_mask,
+                                                    training=training)
+            hidden_states = cross_attention_outputs[0]
+            # Self-attention extras stay first: TFT5MainLayer indexes them before the cross-attention ones.
+            outputs = outputs + cross_attention_outputs[1:]
+            hidden_states = self.layer[2](hidden_states, training=training)
+        outputs = (hidden_states,) + outputs  # hidden states, then self-attn extras, then cross-attn extras
         return outputs
 
 
@@ -85,6 +343,19 @@ class TFT5Layer(tf.keras.layers.Layer):
 class TFT5MainLayer(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFT5MainLayer, self).__init__(**kwargs)
+        self.output_attentions = config.output_attentions
+        self.output_hidden_states = config.output_hidden_states
+        self.is_decoder = config.is_decoder
+        self.config = config
+        self.num_hidden_layers = config.num_layers
+
+        self.block = [TFT5Block(config,
+                                has_relative_attention_bias=bool(i == 0),  # only the first block owns bias weights
+                                name='block_._{}'.format(i))
+                        for i in range(config.num_layers)]
+        self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon,
+                                                                   name='final_layer_norm')
+        self.dropout = tf.keras.layers.Dropout(config.dropout_rate)  # applied after final_layer_norm in call
 
     def _resize_token_embeddings(self, new_num_tokens):
         raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
@@ -92,51 +363,56 @@ class TFT5MainLayer(tf.keras.layers.Layer):
     def _prune_heads(self, heads_to_prune):
         raise NotImplementedError  # Not implemented yet in the library fr TF 2.0 models
 
-    def call(self, inputs, attention_mask=None, token_type_ids=None, position_ids=None, head_mask=None, training=False):
-        # We allow three types of multi-inputs:
-        # - traditional keyword arguments in the call method
-        # - all the arguments provided as a dict in the first positional argument of call
-        # - all the arguments provided as a list/tuple (ordered) in the first positional argument of call
-        # The last two options are useful to use the tf.keras fit() method.
-
-        if isinstance(inputs, (tuple, list)):
-            input_ids = inputs[0]
-            attention_mask = inputs[1] if len(inputs) > 1 else attention_mask
-            token_type_ids = inputs[2] if len(inputs) > 2 else token_type_ids
-            position_ids = inputs[3] if len(inputs) > 3 else position_ids
-            head_mask = inputs[4] if len(inputs) > 4 else head_mask
-            assert len(inputs) <= 5, "Too many inputs."
-        elif isinstance(inputs, dict):
-            input_ids = inputs.get('input_ids')
-            attention_mask = inputs.get('attention_mask', attention_mask)
-            token_type_ids = inputs.get('token_type_ids', token_type_ids)
-            position_ids = inputs.get('position_ids', position_ids)
-            head_mask = inputs.get('head_mask', head_mask)
-            assert len(inputs) <= 5, "Too many inputs."
-        else:
-            input_ids = inputs
+    def call(self, hidden_states, attention_mask=None, encoder_hidden_states=None,
+             encoder_attention_mask=None, head_mask=None, training=False):
 
+        batch_size, seq_length = shape_list(hidden_states)[:2]  # only the two leading dims are needed for default masks
         if attention_mask is None:
-            attention_mask = tf.fill(tf.shape(input_ids), 1)
-        if token_type_ids is None:
-            token_type_ids = tf.fill(tf.shape(input_ids), 0)
+            attention_mask = tf.fill((batch_size, seq_length), 1)
+        if self.is_decoder and encoder_attention_mask is None:
+            encoder_seq_length = shape_list(encoder_hidden_states)[1]
+            encoder_attention_mask = tf.fill((batch_size, encoder_seq_length), 1)
 
-        # We create a 3D attention mask from a 2D tensor mask.
-        # Sizes are [batch_size, 1, 1, to_seq_length]
-        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
-        # this attention mask is more simple than the triangular masking of causal attention
-        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
-        extended_attention_mask = attention_mask[:, tf.newaxis, tf.newaxis, :]
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        attention_mask = tf.cast(attention_mask, dtype=tf.float32)
+        num_dims_attention_mask = len(shape_list(attention_mask))
+        if num_dims_attention_mask == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif num_dims_attention_mask == 2:
+            # Provided a padding mask of dimensions [batch_size, seq_length]
+            # - if the model is a decoder, apply a causal mask in addition to the padding mask
+            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            if self.config.is_decoder:
+                seq_ids = tf.range(seq_length)
+                causal_mask = tf.less_equal(tf.tile(seq_ids[None, None, :], (batch_size, seq_length, 1)),
+                                            seq_ids[None, :, None])
+                causal_mask = tf.cast(causal_mask, dtype=tf.float32)
+                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
+            else:
+                extended_attention_mask = attention_mask[:, None, None, :]
 
         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
         # positions we want to attend and -10000.0 for masked positions.
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
-
-        extended_attention_mask = tf.cast(extended_attention_mask, tf.float32)
         extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
 
+        if self.is_decoder:
+            # If a 2D or 3D attention mask is provided for the cross-attention
+            # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            encoder_attention_mask = tf.cast(encoder_attention_mask, dtype=tf.float32)
+            num_dims_encoder_attention_mask = len(shape_list(encoder_attention_mask))
+            if num_dims_encoder_attention_mask == 3:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, :, :]
+            elif num_dims_encoder_attention_mask == 2:
+                encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
+
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
+        else:
+            encoder_extended_attention_mask = None
+
         # Prepare head mask if needed
         # 1.0 in head_mask indicate we keep the head
         # attention_probs has shape bsz x n_heads x N x N
@@ -148,14 +424,44 @@ class TFT5MainLayer(tf.keras.layers.Layer):
             head_mask = [None] * self.num_hidden_layers
             # head_mask = tf.constant([0] * self.num_hidden_layers)
 
-        ##################################
-        # Replace this with your model code
-        embedding_output = self.embeddings(input_ids, position_ids=position_ids, token_type_ids=token_type_ids)
-        encoder_outputs = self.encoder([embedding_output, extended_attention_mask, head_mask], training=training)
-        sequence_output = encoder_outputs[0]
-        outputs = (sequence_output,) + encoder_outputs[1:]  # add hidden_states and attentions if they are here
+        all_hidden_states = ()
+        all_attentions = ()
+        position_bias = None  # taken from the first block's outputs, then passed to every later block
+        encoder_decoder_position_bias = None  # same, for the decoder's cross-attention
+        for i, layer_module in enumerate(self.block):
+            if self.output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
 
-        return outputs  # sequence_output, (hidden_states), (attentions)
+            layer_outputs = layer_module(hidden_states,
+                                         attention_mask=extended_attention_mask,
+                                         position_bias=position_bias,
+                                         encoder_hidden_states=encoder_hidden_states,
+                                         encoder_attention_mask=encoder_extended_attention_mask,
+                                         encoder_decoder_position_bias=encoder_decoder_position_bias,
+                                         head_mask=head_mask[i],
+                                         training=training)
+            hidden_states = layer_outputs[0]
+            if i == 0:
+                position_bias = layer_outputs[2 if self.output_attentions else 1]
+                if self.is_decoder:
+                    encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
+
+            if self.output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)  # the dropped-out tensor is what gets returned
+
+        # Add last layer
+        if self.output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        outputs = (hidden_states,)
+        if self.output_hidden_states:
+            outputs = outputs + (all_hidden_states,)
+        if self.output_attentions:
+            outputs = outputs + (all_attentions,)
+        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
 
 
 ####################################################
@@ -173,18 +479,26 @@ class TFT5PreTrainedModel(TFPreTrainedModel):
     pretrained_model_archive_map = TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
     base_model_prefix = "transformer"
 
+    @property
+    def dummy_inputs(self):
+        """ Fixed 3x5 batch of token ids (used for both encoder and decoder) plus a decoder mask. """
+        token_ids = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
+        decoder_mask = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        return {'decoder_input_ids': token_ids,
+                'encoder_input_ids': token_ids,
+                'decoder_attention_mask': decoder_mask}
 
-T5_START_DOCSTRING = r"""    The XXX model was proposed in
-    `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_
-    by Jacob Devlin, Ming-Wei Chang, Kenton Lee and Kristina Toutanova. It's a bidirectional transformer
-    pre-trained using a combination of masked language modeling objective and next sentence prediction
-    on a large corpus comprising the Toronto Book Corpus and Wikipedia.
+
+T5_START_DOCSTRING = r"""    The T5 model was proposed in
+    `Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`_
+    by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li, Peter J. Liu.
+    It's an encoder decoder transformer pre-trained in a text-to-text denoising generative setting.
 
     This model is a tf.keras.Model `tf.keras.Model`_ sub-class. Use it as a regular TF 2.0 Keras Model and
     refer to the TF 2.0 documentation for all matter related to general usage and behavior.
 
-    .. _`XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`:
-        https://arxiv.org/abs/1810.04805
+    .. _`Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer`:
+        https://arxiv.org/abs/1910.10683
 
     .. _`tf.keras.Model`:
         https://www.tensorflow.org/versions/r2.0/api_docs/python/tf/keras/Model
@@ -206,67 +520,50 @@ T5_START_DOCSTRING = r"""    The XXX model was proposed in
             `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
 
     Parameters:
-        config (:class:`~transformers.XxxConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.T5Config`): Model configuration class with all the parameters of the model. 
             Initializing with a config file does not load the weights associated with the model, only the configuration.
             Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
-XXX_INPUTS_DOCSTRING = r"""
+T5_INPUTS_DOCSTRING = r"""
     Inputs:
         **input_ids**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
             Indices of input sequence tokens in the vocabulary.
-            To match pre-training, XXX input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+            To match pre-training, T5 input sequence should be formatted with [CLS] and [SEP] tokens as follows:
 
             (a) For sequence pairs:
 
                 ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-                
-                ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
 
             (b) For single sequences:
 
                 ``tokens:         [CLS] the dog is hairy . [SEP]``
-                
-                ``token_type_ids:   0   0   0   0  0     0   0``
 
-            Xxx is a model with absolute position embeddings so it's usually advised to pad the inputs on
-            the right rather than the left.
 
-            Indices can be obtained using :class:`transformers.XxxTokenizer`.
+            T5 is a model with relative position embeddings so you should be able to pad the inputs on
+            the right or the left.
+
+            Indices can be obtained using :class:`transformers.T5Tokenizer`.
             See :func:`transformers.PreTrainedTokenizer.encode` and
             :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
         **attention_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
             Mask to avoid performing attention on padding token indices.
             Mask values selected in ``[0, 1]``:
             ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
-        **token_type_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
-            Segment token indices to indicate first and second portions of the inputs.
-            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
-            corresponds to a `sentence B` token
-            (see `XXX: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
-        **position_ids**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length)``:
-            Indices of positions of each input sequence tokens in the position embeddings.
-            Selected in the range ``[0, config.max_position_embeddings - 1]``.
         **head_mask**: (`optional`) ``Numpy array`` or ``tf.Tensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
             Mask to nullify selected heads of the self-attention modules.
             Mask values selected in ``[0, 1]``:
             ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
 """
 
-@add_start_docstrings("The bare Xxx Model transformer outputing raw hidden-states without any specific head on top.",
-                      XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
-class TFXxxModel(TFXxxPreTrainedModel):
+@add_start_docstrings("The bare T5 Model transformer outputting raw hidden-states "
+                      "without any specific head on top.",
+                      T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+class TFT5Model(TFT5PreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **last_hidden_state**: ``tf.Tensor`` of shape ``(batch_size, sequence_length, hidden_size)``
             Sequence of hidden-states at the output of the last layer of the model.
-        **pooler_output**: ``tf.Tensor`` of shape ``(batch_size, hidden_size)``
-            Last layer hidden-state of the first token of the sequence (classification token)
-            further processed by a Linear layer and a Tanh activation function. The Linear
-            layer weights are trained from the next sentence prediction (classification)
-            objective during Xxx pretraining. This output is usually *not* a good summary
-            of the semantic content of the input, you're often better with averaging or pooling
-            the sequence of hidden-states for the whole input sequence.
         **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
             list of ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
             of shape ``(batch_size, sequence_length, hidden_size)``:
@@ -278,27 +575,68 @@ class TFXxxModel(TFXxxPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from transformers import XxxTokenizer, TFXxxModel
+        from transformers import T5Tokenizer, TFT5Model
 
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = TFXxxModel.from_pretrained('xxx-base-uncased')
+        tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        model = TFT5Model.from_pretrained('t5-small')
         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFXxxModel, self).__init__(config, *inputs, **kwargs)
-        self.transformer = TFXxxMainLayer(config, name='transformer')
+        super(TFT5Model, self).__init__(config, *inputs, **kwargs)
+        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model,
+                                         name='shared')
 
-    def call(self, inputs, **kwargs):
-        outputs = self.transformer(inputs, **kwargs)
-        return outputs
+        encoder_config = copy.deepcopy(config)
+        self.encoder = TFT5MainLayer(encoder_config, name='encoder')
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        self.decoder = TFT5MainLayer(decoder_config, name='decoder')
+
+    def call(self, decoder_input_ids, **kwargs):
+        # We allow two types of multi-inputs:
+        # - traditional keyword arguments in the call method
+        # - all the arguments provided as a dict in the first positional argument of call
+        # The last option is useful to use the tf.keras fit() method.
+
+        if isinstance(decoder_input_ids, dict):
+            kwargs.update(decoder_input_ids)
+        else:
+            kwargs['decoder_input_ids'] = decoder_input_ids
+
+        kwargs_common = dict((k, v) for k, v in kwargs.items()
+                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
+
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        if encoder_hidden_states is None:
+            encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+            hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
+            encoder_hidden_states = encoder_outputs[0]
+        else:
+            encoder_outputs = ()
+
+        # Decode
+        decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+        hidden_states = self.shared(decoder_inputs_ids)  # Convert inputs in embeddings
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
+        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
+
+        return decoder_outputs + encoder_outputs
 
 
-@add_start_docstrings("""Xxx Model with a `language modeling` head on top. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
-class TFXxxForMaskedLM(TFXxxPreTrainedModel):
+@add_start_docstrings("""T5 Model with a `language modeling` head on top. """,
+    T5_START_DOCSTRING, T5_INPUTS_DOCSTRING)
+class TFT5WithLMHeadModel(TFT5PreTrainedModel):
     r"""
     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
         **prediction_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
@@ -314,183 +652,66 @@ class TFXxxForMaskedLM(TFXxxPreTrainedModel):
     Examples::
 
         import tensorflow as tf
-        from transformers import XxxTokenizer, TFXxxForMaskedLM
+        from transformers import T5Tokenizer, TFT5WithLMHeadModel
 
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = TFXxxForMaskedLM.from_pretrained('xxx-base-uncased')
+        tokenizer = T5Tokenizer.from_pretrained('t5-small')
+        model = TFT5WithLMHeadModel.from_pretrained('t5-small')
         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
         outputs = model(input_ids)
         prediction_scores = outputs[0]
 
     """
     def __init__(self, config, *inputs, **kwargs):
-        super(TFXxxForMaskedLM, self).__init__(config, *inputs, **kwargs)
+        super(TFT5WithLMHeadModel, self).__init__(config, *inputs, **kwargs)
+        self.model_dim = config.d_model
 
-        self.transformer = TFXxxMainLayer(config, name='transformer')
-        self.mlm = TFXxxMLMHead(config, self.transformer.embeddings, name='mlm')
+        self.shared = TFSharedEmbeddings(config.vocab_size, config.d_model,
+                                         name='shared')
 
-    def call(self, inputs, **kwargs):
-        outputs = self.transformer(inputs, **kwargs)
+        encoder_config = copy.deepcopy(config)
+        self.encoder = TFT5MainLayer(encoder_config, name='encoder')
 
-        sequence_output = outputs[0]
-        prediction_scores = self.mlm(sequence_output, training=kwargs.get('training', False))
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        self.decoder = TFT5MainLayer(decoder_config, name='decoder')
 
-        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
+    def call(self, decoder_input_ids, **kwargs):
+        # We allow two types of multi-inputs:
+        # - traditional keyword arguments in the call method
+        # - all the arguments provided as a dict in the first positional argument of call
+        # The last option is useful to use the tf.keras fit() method.
 
-        return outputs  # prediction_scores, (hidden_states), (attentions)
+        if isinstance(decoder_input_ids, dict):
+            kwargs.update(decoder_input_ids)
+        else:
+            kwargs['decoder_input_ids'] = decoder_input_ids
 
+        kwargs_common = dict((k, v) for k, v in kwargs.items()
+                             if not k.startswith("encoder_") and not k.startswith("decoder_"))
+        kwargs_encoder = kwargs_common.copy()
+        kwargs_decoder = kwargs_common.copy()
+        kwargs_encoder.update(dict((k[len("encoder_"):], v) for k, v in kwargs.items() if k.startswith("encoder_")))
+        kwargs_decoder.update(dict((k[len("decoder_"):], v) for k, v in kwargs.items() if k.startswith("decoder_")))
 
-@add_start_docstrings("""Xxx Model transformer with a sequence classification/regression head on top (a linear layer on top of
-    the pooled output) e.g. for GLUE tasks. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
-class TFXxxForSequenceClassification(TFXxxPreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **logits**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, config.num_labels)``
-            Classification (or regression if config.num_labels==1) scores (before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+        # Encode if needed (training, first prediction pass)
+        encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        if encoder_hidden_states is None:
+            encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+            hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+            encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
+            encoder_hidden_states = encoder_outputs[0]
+        else:
+            encoder_outputs = ()
 
-    Examples::
+        # Decode
+        decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+        hidden_states = self.shared(decoder_inputs_ids)  # Convert inputs in embeddings
+        kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
+        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
+        decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
 
-        import tensorflow as tf
-        from transformers import XxxTokenizer, TFXxxForSequenceClassification
+        sequence_output = decoder_outputs[0] * (self.model_dim ** -0.5)
+        lm_logits = self.shared(sequence_output, mode="linear")
+        decoder_outputs = (lm_logits,) + decoder_outputs[1:]
 
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = TFXxxForSequenceClassification.from_pretrained('xxx-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        logits = outputs[0]
-
-    """
-    def __init__(self, config, *inputs, **kwargs):
-        super(TFXxxForSequenceClassification, self).__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.transformer = TFXxxMainLayer(config, name='transformer')
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='classifier')
-
-    def call(self, inputs, **kwargs):
-        outputs = self.transformer(inputs, **kwargs)
-
-        pooled_output = outputs[1]
-
-        pooled_output = self.dropout(pooled_output, training=kwargs.get('training', False))
-        logits = self.classifier(pooled_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        return outputs  # logits, (hidden_states), (attentions)
-
-
-@add_start_docstrings("""Xxx Model with a token classification head on top (a linear layer on top of
-    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
-class TFXxxForTokenClassification(TFXxxPreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
-            Classification scores (before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XxxTokenizer, TFXxxForTokenClassification
-
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = TFXxxForTokenClassification.from_pretrained('xxx-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        scores = outputs[0]
-
-    """
-    def __init__(self, config, *inputs, **kwargs):
-        super(TFXxxForTokenClassification, self).__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.transformer = TFXxxMainLayer(config, name='transformer')
-        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
-        self.classifier = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='classifier')
-
-    def call(self, inputs, **kwargs):
-        outputs = self.transformer(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-
-        sequence_output = self.dropout(sequence_output, training=kwargs.get('training', False))
-        logits = self.classifier(sequence_output)
-
-        outputs = (logits,) + outputs[2:]  # add hidden states and attention if they are here
-
-        return outputs  # scores, (hidden_states), (attentions)
-
-
-@add_start_docstrings("""Xxx Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear layers on top of
-    the hidden-states output to compute `span start logits` and `span end logits`). """,
-    XXX_START_DOCSTRING, XXX_INPUTS_DOCSTRING)
-class TFXxxForQuestionAnswering(TFXxxPreTrainedModel):
-    r"""
-    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
-        **start_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
-            Span-start scores (before SoftMax).
-        **end_scores**: ``Numpy array`` or ``tf.Tensor`` of shape ``(batch_size, sequence_length,)``
-            Span-end scores (before SoftMax).
-        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for the output of each layer + the output of the embeddings)
-            of shape ``(batch_size, sequence_length, hidden_size)``:
-            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
-        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
-            list of ``Numpy array`` or ``tf.Tensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
-            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
-
-    Examples::
-
-        import tensorflow as tf
-        from transformers import XxxTokenizer, TFXxxForQuestionAnswering
-
-        tokenizer = XxxTokenizer.from_pretrained('xxx-base-uncased')
-        model = TFXxxForQuestionAnswering.from_pretrained('xxx-base-uncased')
-        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
-        start_scores, end_scores = outputs[:2]
-
-    """
-    def __init__(self, config, *inputs, **kwargs):
-        super(TFXxxForQuestionAnswering, self).__init__(config, *inputs, **kwargs)
-        self.num_labels = config.num_labels
-
-        self.transformer = TFXxxMainLayer(config, name='transformer')
-        self.qa_outputs = tf.keras.layers.Dense(config.num_labels,
-                                                kernel_initializer=get_initializer(config.initializer_range),
-                                                name='qa_outputs')
-
-    def call(self, inputs, **kwargs):
-        outputs = self.transformer(inputs, **kwargs)
-
-        sequence_output = outputs[0]
-
-        logits = self.qa_outputs(sequence_output)
-        start_logits, end_logits = tf.split(logits, 2, axis=-1)
-        start_logits = tf.squeeze(start_logits, axis=-1)
-        end_logits = tf.squeeze(end_logits, axis=-1)
-
-        outputs = (start_logits, end_logits,) + outputs[2:]
-
-        return outputs  # start_logits, end_logits, (hidden_states), (attentions)
+        return decoder_outputs + encoder_outputs
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index 063f52365d..5b1d3bb458 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -160,8 +160,7 @@ class PreTrainedModel(nn.Module):
         base_model.vocab_size = new_num_tokens
 
         # Tie weights again if needed
-        if hasattr(self, 'tie_weights'):
-            self.tie_weights()
+        self.tie_weights()
 
         return model_embeds
 
@@ -458,8 +457,7 @@ class PreTrainedModel(nn.Module):
                 raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
                                 model.__class__.__name__, "\n\t".join(error_msgs)))
 
-        if hasattr(model, 'tie_weights'):
-            model.tie_weights()  # make sure word embedding weights are still tied
+        model.tie_weights()  # make sure word embedding weights are still tied if needed
 
         # Set model in evaluation mode to desactivate DropOut modules by default
         model.eval()
diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py
index f636c42889..6c3954a088 100644
--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -69,6 +69,7 @@ class TFCommonTestCases:
         test_torchscript = True
         test_pruning = True
         test_resize_embeddings = True
+        is_encoder_decoder = False
 
         def test_initialization(self):
             pass
@@ -156,7 +157,11 @@ class TFCommonTestCases:
         def test_compile_tf_model(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
-            input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
+            if self.is_encoder_decoder:
+                input_ids = {'decoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='decoder_input_ids', dtype='int32'),
+                             'encoder_input_ids': tf.keras.Input(batch_shape=(2, 2000), name='encoder_input_ids', dtype='int32')}
+            else:
+                input_ids = tf.keras.Input(batch_shape=(2, 2000), name='input_ids', dtype='int32')
             optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0)
             loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
             metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
@@ -189,7 +194,7 @@ class TFCommonTestCases:
                 outputs_dict = model(inputs_dict)
 
                 inputs_keywords = copy.deepcopy(inputs_dict)
-                input_ids = inputs_keywords.pop('input_ids')
+                input_ids = inputs_keywords.pop('input_ids', inputs_keywords.pop('decoder_input_ids'))
                 outputs_keywords = model(input_ids, **inputs_keywords)
 
                 output_dict = outputs_dict[0].numpy()
@@ -216,12 +221,24 @@ class TFCommonTestCases:
                     self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
                 out_len = len(outputs)
 
+                if self.is_encoder_decoder:
+                    self.assertEqual(out_len % 2, 0)
+                    decoder_attentions = outputs[(out_len // 2)-1]
+                    self.assertEqual(model.config.output_attentions, True)
+                    self.assertEqual(model.config.output_hidden_states, False)
+                    self.assertEqual(len(decoder_attentions), self.model_tester.num_hidden_layers)
+                    self.assertListEqual(
+                        list(decoder_attentions[0].shape[-3:]),
+                        [self.model_tester.num_attention_heads,
+                         self.model_tester.seq_length,
+                         self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+
                 # Check attention is always last and order is fine
                 config.output_attentions = True
                 config.output_hidden_states = True
                 model = model_class(config)
                 outputs = model(inputs_dict)
-                self.assertEqual(out_len+1, len(outputs))
+                self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, True)
 
diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py
index fac6763432..33f6f895f0 100644
--- a/transformers/tests/modeling_tf_t5_test.py
+++ b/transformers/tests/modeling_tf_t5_test.py
@@ -26,7 +26,7 @@ from .configuration_common_test import ConfigTester
 
 from transformers import T5Config, is_tf_available
 
-if False:  # is_tf_available():
+if is_tf_available():
     import tensorflow as tf
     from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
@@ -35,7 +35,8 @@ else:
 
 class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
 
-    all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if False  else () # is_tf_available() else ()
+    is_encoder_decoder = True
+    all_model_classes = (TFT5Model, TFT5WithLMHeadModel) if is_tf_available() else ()
 
     class TFT5ModelTester(object):
 
@@ -45,22 +46,16 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
                      seq_length=7,
                      is_training=True,
                      use_input_mask=True,
-                     use_token_type_ids=True,
                      use_labels=True,
                      vocab_size=99,
+                     n_positions=14,
                      hidden_size=32,
                      num_hidden_layers=5,
                      num_attention_heads=4,
-                     intermediate_size=37,
-                     hidden_act="gelu",
-                     hidden_dropout_prob=0.1,
-                     attention_probs_dropout_prob=0.1,
-                     max_position_embeddings=512,
-                     type_vocab_size=16,
-                     type_sequence_label_size=2,
-                     initializer_range=0.02,
-                     num_labels=3,
-                     num_choices=4,
+                     d_ff=37,
+                     relative_attention_num_buckets=8,
+                     dropout_rate=0.1,
+                     initializer_factor=0.002,
                      scope=None,
                     ):
             self.parent = parent
@@ -68,22 +63,16 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
             self.seq_length = seq_length
             self.is_training = is_training
             self.use_input_mask = use_input_mask
-            self.use_token_type_ids = use_token_type_ids
             self.use_labels = use_labels
             self.vocab_size = vocab_size
+            self.n_positions = n_positions
             self.hidden_size = hidden_size
             self.num_hidden_layers = num_hidden_layers
             self.num_attention_heads = num_attention_heads
-            self.intermediate_size = intermediate_size
-            self.hidden_act = hidden_act
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.type_sequence_label_size = type_sequence_label_size
-            self.initializer_range = initializer_range
-            self.num_labels = num_labels
-            self.num_choices = num_choices
+            self.d_ff = d_ff
+            self.relative_attention_num_buckets = relative_attention_num_buckets
+            self.dropout_rate = dropout_rate
+            self.initializer_factor = initializer_factor
             self.scope = scope
 
         def prepare_config_and_inputs(self):
@@ -93,61 +82,53 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
             if self.use_input_mask:
                 input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
 
-            token_type_ids = None
-            if self.use_token_type_ids:
-                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
-
-            sequence_labels = None
             token_labels = None
-            choice_labels = None
             if self.use_labels:
-                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
-                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             config = T5Config(
                 vocab_size_or_config_json_file=self.vocab_size,
-                hidden_size=self.hidden_size,
-                num_hidden_layers=self.num_hidden_layers,
-                num_attention_heads=self.num_attention_heads,
-                intermediate_size=self.intermediate_size,
-                hidden_act=self.hidden_act,
-                hidden_dropout_prob=self.hidden_dropout_prob,
-                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
-                max_position_embeddings=self.max_position_embeddings,
-                type_vocab_size=self.type_vocab_size,
-                initializer_range=self.initializer_range)
+                n_positions=self.n_positions,
+                d_model=self.hidden_size,
+                d_ff=self.d_ff,
+                d_kv=self.hidden_size // self.num_attention_heads,
+                num_layers=self.num_hidden_layers,
+                num_heads=self.num_attention_heads,
+                relative_attention_num_buckets=self.relative_attention_num_buckets,
+                dropout_rate=self.dropout_rate,
+                initializer_factor=self.initializer_factor)
 
-            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+            return (config, input_ids, input_mask, token_labels)
 
-        def create_and_check_t5_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
             model = TFT5Model(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
-            sequence_output, pooled_output = model(inputs)
+            inputs = {'encoder_input_ids': input_ids,
+                      'decoder_input_ids': input_ids,
+                      'decoder_attention_mask': input_mask}
+            encoder_output, decoder_output = model(inputs)
 
-            inputs = [input_ids, input_mask]
-            sequence_output, pooled_output = model(inputs)
-
-            sequence_output, pooled_output = model(input_ids)
+            encoder_output, decoder_output = model(input_ids,
+                                                   decoder_attention_mask=input_mask,
+                                                   encoder_input_ids=input_ids)
 
             result = {
-                "sequence_output": sequence_output.numpy(),
-                "pooled_output": pooled_output.numpy(),
+                "encoder_output": encoder_output.numpy(),
+                "decoder_output": decoder_output.numpy(),
             }
             self.parent.assertListEqual(
-                list(result["sequence_output"].shape),
+                list(result["encoder_output"].shape),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(
+                list(result["decoder_output"].shape),
                 [self.batch_size, self.seq_length, self.hidden_size])
-            self.parent.assertListEqual(list(result["pooled_output"].shape), [self.batch_size, self.hidden_size])
 
 
-        def create_and_check_t5_with_lm_head(self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels):
+        def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
             model = TFT5WithLMHeadModel(config=config)
-            inputs = {'input_ids': input_ids,
-                      'attention_mask': input_mask,
-                      'token_type_ids': token_type_ids}
-            prediction_scores, = model(inputs)
+            inputs = {'encoder_input_ids': input_ids,
+                      'decoder_input_ids': input_ids,
+                      'decoder_attention_mask': input_mask}
+            prediction_scores, decoder_output = model(inputs)
             result = {
                 "prediction_scores": prediction_scores.numpy(),
             }
@@ -158,14 +139,15 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, token_type_ids, input_mask,
-             sequence_labels, token_labels, choice_labels) = config_and_inputs
-            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            (config, input_ids, input_mask, token_labels) = config_and_inputs
+            inputs_dict = {'encoder_input_ids': input_ids,
+                           'decoder_input_ids': input_ids,
+                           'decoder_attention_mask': input_mask}
             return config, inputs_dict
 
     def setUp(self):
         self.model_tester = TFT5ModelTest.TFT5ModelTester(self)
-        self.config_tester = ConfigTester(self, config_class=T5Config, hidden_size=37)
+        self.config_tester = ConfigTester(self, config_class=T5Config, d_model=37)
 
     def test_config(self):
         self.config_tester.run_common_tests()
@@ -181,7 +163,7 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
     @pytest.mark.slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
-        for model_name in ['t5-base']:
+        for model_name in ['t5-small']:
             model = TFT5Model.from_pretrained(model_name, cache_dir=cache_dir)
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)

From 4321c541254bdabbda631520cff0a5a376ad9f48 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Nov 2019 11:49:32 +0100
Subject: [PATCH 010/110] fix tests

---
 transformers/tests/modeling_tf_common_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py
index 6c3954a088..83a15c137a 100644
--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -194,7 +194,7 @@ class TFCommonTestCases:
                 outputs_dict = model(inputs_dict)
 
                 inputs_keywords = copy.deepcopy(inputs_dict)
-                input_ids = inputs_keywords.pop('input_ids', inputs_keywords.pop('decoder_input_ids'))
+                input_ids = inputs_keywords.pop('input_ids' if not self.is_encoder_decoder else 'decoder_input_ids', None)
                 outputs_keywords = model(input_ids, **inputs_keywords)
 
                 output_dict = outputs_dict[0].numpy()

From f03c0c1423d4635f3e71a6c24053f01f6f02063c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Nov 2019 11:49:46 +0100
Subject: [PATCH 011/110] adding models in readme and auto classes

---
 README.md                                     |  3 ++-
 docs/source/pretrained_models.rst             | 20 +++++++++++++++++++
 transformers/__main__.py                      | 18 +++++++++++++++++
 .../convert_pytorch_checkpoint_to_tf2.py      | 13 ++++++++----
 transformers/modeling_auto.py                 | 13 ++++++++++--
 transformers/modeling_tf_auto.py              | 13 ++++++++++--
 transformers/tokenization_auto.py             |  7 ++++++-
 7 files changed, 77 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 40b08583b1..d6f6e426d8 100644
--- a/README.md
+++ b/README.md
@@ -122,7 +122,8 @@ At some point in the future, you'll be able to seamlessly move from pre-training
 7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), released together with the paper a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.
 8. **[DistilBERT](https://github.com/huggingface/transformers/tree/master/examples/distillation)** (from HuggingFace), released together with the paper [DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter](https://arxiv.org/abs/1910.01108) by Victor Sanh, Lysandre Debut and Thomas Wolf. The same method has been applied to compress GPT2 into [DistilGPT2](https://github.com/huggingface/transformers/tree/master/examples/distillation).
 9. **[CTRL](https://github.com/salesforce/ctrl/)** (from Salesforce) released with the paper [CTRL: A Conditional Transformer Language Model for Controllable Generation](https://arxiv.org/abs/1909.05858) by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
-10. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
+10. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel, Noam Shazeer, Adam Roberts, Katherine Lee, Sharan Narang, Michael Matena, Yanqi Zhou, Wei Li and Peter J. Liu.
+11. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
 
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
 
diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 43c08228bd..c6240dc850 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -144,5 +144,25 @@ Here is the full list of the currently provided pretrained models together with
 | CTRL              | ``ctrl``                                                   | | 48-layer, 1280-hidden, 16-heads, 1.6B parameters                                                                                    |
 |                   |                                                            | | Salesforce's Large-sized CTRL English model                                                                                         |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| T5                | ``t5-small``                                               | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
+|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint                                                   |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``t5-base``                                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
+|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                 |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``t5-large``                                               | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
+|                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``t5-3b``                                                  | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
+|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``t5-11b``                                                 | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
+|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
+|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 
 .. <https://huggingface.co/transformers/examples.html>`__
\ No newline at end of file
diff --git a/transformers/__main__.py b/transformers/__main__.py
index 31dbd24908..6136d768f6 100644
--- a/transformers/__main__.py
+++ b/transformers/__main__.py
@@ -6,6 +6,7 @@ def main():
         "This command line utility let you convert original (author released) model checkpoint to pytorch.\n"
         "It should be used as one of: \n"
         ">> transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
+        ">> transformers t5 TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT, \n"
         ">> transformers gpt OPENAI_GPT_CHECKPOINT_FOLDER_PATH PYTORCH_DUMP_OUTPUT [OPENAI_GPT_CONFIG], \n"
         ">> transformers transfo_xl TF_CHECKPOINT_OR_DATASET PYTORCH_DUMP_OUTPUT [TF_CONFIG] or \n"
         ">> transformers gpt2 TF_CHECKPOINT PYTORCH_DUMP_OUTPUT [GPT2_CONFIG] or \n"
@@ -21,6 +22,23 @@ def main():
                     "https://www.tensorflow.org/install/ for installation instructions.")
                 raise
 
+            if len(sys.argv) != 5:
+                # pylint: disable=line-too-long
+                print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
+            else:
+                PYTORCH_DUMP_OUTPUT = sys.argv.pop()
+                TF_CONFIG = sys.argv.pop()
+                TF_CHECKPOINT = sys.argv.pop()
+                convert_tf_checkpoint_to_pytorch(TF_CHECKPOINT, TF_CONFIG, PYTORCH_DUMP_OUTPUT)
+        elif sys.argv[1] == "t5":
+            try:
+                from .convert_t5_original_tf_checkpoint_to_pytorch import convert_tf_checkpoint_to_pytorch
+            except ImportError:
+                print("transformers can only be used from the commandline to convert TensorFlow models in PyTorch, "
+                    "In that case, it requires TensorFlow to be installed. Please see "
+                    "https://www.tensorflow.org/install/ for installation instructions.")
+                raise
+
             if len(sys.argv) != 5:
                 # pylint: disable=line-too-long
                 print("Should be used as `transformers bert TF_CHECKPOINT TF_CONFIG PYTORCH_DUMP_OUTPUT`")
diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py
index e673b77dcc..19629172ff 100644
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -33,7 +33,8 @@ from transformers import (load_pytorch_checkpoint_in_tf2_model,
                                   OpenAIGPTConfig, TFOpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   RobertaConfig, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
                                   DistilBertConfig, TFDistilBertForMaskedLM, TFDistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-                                  CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP)
+                                  CTRLConfig, TFCTRLLMHeadModel, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+                                  T5Config, TFT5WithLMHeadModel, T5_PRETRAINED_CONFIG_ARCHIVE_MAP)
 
 if is_torch_available():
     import torch
@@ -46,7 +47,8 @@ if is_torch_available():
                                       OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
                                       RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
                                       DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-                                      CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP)
+                                      CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+                                      T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP)
 else:
     (BertForPreTraining, BertForQuestionAnswering, BertForSequenceClassification, BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -56,7 +58,8 @@ else:
     OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
     RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
     DistilBertForMaskedLM, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-    CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP) = (
+    CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+    T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP) = (
         None, None, None, None,
         None, None,
         None, None,
@@ -65,6 +68,7 @@ else:
         None, None,
         None, None, None,
         None, None, None,
+        None, None,
         None, None)
 
 
@@ -85,7 +89,8 @@ MODEL_CLASSES = {
     'roberta-large-mnli': (RobertaConfig, TFRobertaForSequenceClassification, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP),
     'distilbert': (DistilBertConfig, TFDistilBertForMaskedLM, DistilBertForMaskedLM, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
     'distilbert-base-uncased-distilled-squad': (DistilBertConfig, TFDistilBertForQuestionAnswering, DistilBertForQuestionAnswering, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP),
-    'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP)
+    'ctrl': (CTRLConfig, TFCTRLLMHeadModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP),
+    't5': (T5Config, TFT5WithLMHeadModel, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP, T5_PRETRAINED_CONFIG_ARCHIVE_MAP),
 }
 
 def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file, tf_dump_path, compare_with_pt_model=False, use_cached_models=True):
diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py
index d98110d4bd..a2129176d3 100644
--- a/transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -27,6 +27,7 @@ from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassi
 from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
 from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
 from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
+from .modeling_t5 import T5Model, T5WithLMHeadModel
 
 from .modeling_utils import PreTrainedModel, SequenceSummary
 
@@ -47,6 +48,7 @@ class AutoModel(object):
 
         The base model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Model (T5 model)
             - contains `distilbert`: DistilBertModel (DistilBERT model)
             - contains `roberta`: RobertaModel (RoBERTa model)
             - contains `bert`: BertModel (Bert model)
@@ -70,6 +72,7 @@ class AutoModel(object):
 
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Model (T5 model)
             - contains `distilbert`: DistilBertModel (DistilBERT model)
             - contains `roberta`: RobertaModel (RoBERTa model)
             - contains `bert`: BertModel (Bert model)
@@ -136,7 +139,9 @@ class AutoModel(object):
             model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
@@ -171,6 +176,7 @@ class AutoModelWithLMHead(object):
 
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5WithLMHeadModel (T5 model)
             - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
             - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
             - contains `bert`: BertForMaskedLM (Bert model)
@@ -197,6 +203,7 @@ class AutoModelWithLMHead(object):
 
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5WithLMHeadModel (T5 model)
             - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
             - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
             - contains `bert`: BertForMaskedLM (Bert model)
@@ -262,7 +269,9 @@ class AutoModelWithLMHead(object):
             model = AutoModelWithLMHead.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py
index df0ad6e401..b24623dcdc 100644
--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -27,6 +27,7 @@ from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceC
 from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
 from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
 from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel
+from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel
 
 from .file_utils import add_start_docstrings
 
@@ -45,6 +46,7 @@ class TFAutoModel(object):
 
         The base model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5Model (T5 model)
             - contains `distilbert`: TFDistilBertModel (DistilBERT model)
             - contains `roberta`: TFRobertaModel (RoBERTa model)
             - contains `bert`: TFBertModel (Bert model)
@@ -68,6 +70,7 @@ class TFAutoModel(object):
 
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5Model (T5 model)
             - contains `distilbert`: TFDistilBertModel (DistilBERT model)
             - contains `roberta`: TFRobertaModel (RoBERTa model)
             - contains `bert`: TFTFBertModel (Bert model)
@@ -133,7 +136,9 @@ class TFAutoModel(object):
             model = TFAutoModel.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
@@ -169,6 +174,7 @@ class TFAutoModelWithLMHead(object):
 
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5WithLMHeadModel (T5 model)
             - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
             - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
             - contains `bert`: TFBertForMaskedLM (Bert model)
@@ -195,6 +201,7 @@ class TFAutoModelWithLMHead(object):
 
         The model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: TFT5WithLMHeadModel (T5 model)
             - contains `distilbert`: TFDistilBertForMaskedLM (DistilBERT model)
             - contains `roberta`: TFRobertaForMaskedLM (RoBERTa model)
             - contains `bert`: TFBertForMaskedLM (Bert model)
@@ -261,7 +268,9 @@ class TFAutoModelWithLMHead(object):
             model = TFAutoModelWithLMHead.from_pretrained('./pt_model/bert_pytorch_model.bin', from_pt=True, config=config)
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py
index ec056de17f..5be2562448 100644
--- a/transformers/tokenization_auto.py
+++ b/transformers/tokenization_auto.py
@@ -27,6 +27,7 @@ from .tokenization_xlnet import XLNetTokenizer
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
 from .tokenization_distilbert import DistilBertTokenizer
+from .tokenization_t5 import T5Tokenizer
 
 logger = logging.getLogger(__name__)
 
@@ -41,6 +42,7 @@ class AutoTokenizer(object):
 
         The tokenizer class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Tokenizer (T5 model)
             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
             - contains `roberta`: RobertaTokenizer (RoBERTa model)
             - contains `bert`: BertTokenizer (Bert model)
@@ -64,6 +66,7 @@ class AutoTokenizer(object):
 
         The tokenizer class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
+            - contains `t5`: T5Tokenizer (T5 model)
             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
             - contains `roberta`: RobertaTokenizer (XLM model)
             - contains `bert`: BertTokenizer (Bert model)
@@ -101,7 +104,9 @@ class AutoTokenizer(object):
             tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
 
         """
-        if 'distilbert' in pretrained_model_name_or_path:
+        if 't5' in pretrained_model_name_or_path:
+            return T5Tokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'distilbert' in pretrained_model_name_or_path:
             return DistilBertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

From 15e53c4e8712260b016225310c397e19a5f7b21c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Nov 2019 12:43:21 +0100
Subject: [PATCH 012/110] maybe fix tests

---
 transformers/tests/modeling_tf_common_test.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py
index 83a15c137a..20ccfd8ce0 100644
--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -131,7 +131,11 @@ class TFCommonTestCases:
                 with torch.no_grad():
                     pto = pt_model(**pt_inputs_dict)
                 tfo = tf_model(inputs_dict)
-                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                tfo = tfo[0].numpy()
+                pto = pto[0].numpy()
+                tfo[np.isnan(tfo)] = 0
+                pto[np.isnan(pto)] = 0
+                max_diff = np.amax(np.abs(tfo - pto))
                 self.assertLessEqual(max_diff, 2e-2)
 
                 # Check we can load pt model in tf and vice-versa with checkpoint => model functions
@@ -151,7 +155,11 @@ class TFCommonTestCases:
                 with torch.no_grad():
                     pto = pt_model(**pt_inputs_dict)
                 tfo = tf_model(inputs_dict)
-                max_diff = np.amax(np.abs(tfo[0].numpy() - pto[0].numpy()))
+                tfo = tfo[0].numpy()
+                pto = pto[0].numpy()
+                tfo[np.isnan(tfo)] = 0
+                pto[np.isnan(pto)] = 0
+                max_diff = np.amax(np.abs(tfo - pto))
                 self.assertLessEqual(max_diff, 2e-2)
 
         def test_compile_tf_model(self):

From b4fcd59a5ae8d12102db106d3b03849ef86109bd Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Nov 2019 14:38:53 +0100
Subject: [PATCH 013/110] add sentinels in tokenizer

---
 transformers/tokenization_t5.py | 31 ++++++++++++++++++++++++++-----
 1 file changed, 26 insertions(+), 5 deletions(-)

diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py
index 93842d29f0..3847aeefbf 100644
--- a/transformers/tokenization_t5.py
+++ b/transformers/tokenization_t5.py
@@ -18,6 +18,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 import os
+import re
 import six
 from shutil import copyfile
 
@@ -31,7 +32,7 @@ SPIECE_UNDERLINE = u'▁'
 # Mapping from the keyword arguments names of Tokenizer `__init__`
 # to file names for serializing Tokenizer instances
 ####################################################
-VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
 
 ####################################################
 # Mapping from the keyword arguments names of Tokenizer `__init__`
@@ -56,15 +57,27 @@ class T5Tokenizer(PreTrainedTokenizer):
         SentencePiece based tokenizer. Peculiarities:
 
             - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+            - `extra_ids` adds a number of extra ids to the end of the vocabulary for use as sentinels.
+                These tokens are accessible as `<extra_id_{%d}>` where `{%d}` is a number between 0 and extra_ids-1.
+                Extra tokens are indexed from the end of the vocabulary towards the beginning (<extra_id_0> is the last token in the vocabulary)
+                (like in T5 preprocessing
+                see: https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, vocab_file, eos_token="</s>", unk_token="<unk>",
-                 pad_token="<pad>", **kwargs):
+                 pad_token="<pad>", extra_ids=100, additional_special_tokens=None, **kwargs):
+        # Add extra_ids to the special token list
+        if extra_ids > 0:
+            if additional_special_tokens is None:
+                additional_special_tokens = []
+            additional_special_tokens.extend([u"<extra_id_{}>".format(i) for i in range(extra_ids)])
+
         super(T5Tokenizer, self).__init__(eos_token=eos_token, unk_token=unk_token,
-                                          pad_token=pad_token, **kwargs)
+                                          pad_token=pad_token, additional_special_tokens=additional_special_tokens,
+                                          **kwargs)
 
         try:
             import sentencepiece as spm
@@ -74,13 +87,14 @@ class T5Tokenizer(PreTrainedTokenizer):
                            "pip install sentencepiece")
 
         self.vocab_file = vocab_file
+        self._extra_ids = extra_ids
 
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(vocab_file)
 
     @property
     def vocab_size(self):
-        return self.sp_model.get_piece_size()
+        return self.sp_model.get_piece_size() + self._extra_ids
 
     def __getstate__(self):
         state = self.__dict__.copy()
@@ -118,11 +132,18 @@ class T5Tokenizer(PreTrainedTokenizer):
 
     def _convert_token_to_id(self, token):
         """ Converts a token (str/unicode) in an id using the vocab. """
+        if token.startswith(u"<extra_id_"):
+            l = re.match(r'<extra_id_(\d+)>', token)
+            num = int(l[1])
+            return self.vocab_size - num - 1
         return self.sp_model.piece_to_id(token)
 
     def _convert_id_to_token(self, index, return_unicode=True):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""
-        token = self.sp_model.IdToPiece(index)
+        if index < self.sp_model.get_piece_size():
+            token = self.sp_model.IdToPiece(index)
+        else:
+            token = u"<extra_id_{}>".format(self.vocab_size - 1 - index)
         if six.PY2 and return_unicode and isinstance(token, str):
             token = token.decode('utf-8')
         return token

From 268d4f2099f90bb62949988c3b78596242e1d753 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 8 Nov 2019 16:41:55 +0100
Subject: [PATCH 014/110] fix position biases + better tests

---
 transformers/modeling_t5.py            | 11 +++--
 transformers/tests/modeling_t5_test.py | 62 +++++++++++++++-----------
 2 files changed, 42 insertions(+), 31 deletions(-)

diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index 6be0ae6863..2a74333d31 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -408,7 +408,7 @@ class T5Block(nn.Module):
                                                 position_bias=position_bias,
                                                 head_mask=head_mask)
         hidden_states = self_attention_outputs[0]
-        outputs = self_attention_outputs[1:]
+        outputs = self_attention_outputs[1:]  # Keep self-attention outputs and relative position weights
 
         if not self.is_decoder:
             hidden_states = self.layer[1](hidden_states)
@@ -419,11 +419,11 @@ class T5Block(nn.Module):
                                                     position_bias=encoder_decoder_position_bias,
                                                     head_mask=head_mask)
             hidden_states = cross_attention_outputs[0]
-            outputs = cross_attention_outputs[1:] + outputs
+            outputs = outputs + cross_attention_outputs[1:]  # Keep cross-attention outputs and relative position weights
             hidden_states = self.layer[2](hidden_states)
 
         outputs = (hidden_states,) + outputs  # add attentions if we output them
-        return outputs
+        return outputs  # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
 
 
 class T5PreTrainedModel(PreTrainedModel):
@@ -564,14 +564,17 @@ class T5Stack(T5PreTrainedModel):
                                          encoder_attention_mask=encoder_extended_attention_mask,
                                          encoder_decoder_position_bias=encoder_decoder_position_bias,
                                          head_mask=head_mask[i])
+            # layer_outputs is a tuple with:
+            # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
             hidden_states = layer_outputs[0]
             if i == 0:
+                # We share the position biases between the layers - the first layer stores them
                 position_bias = layer_outputs[2 if self.output_attentions else 1]
                 if self.is_decoder:
                     encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
 
             if self.output_attentions:
-                all_attentions = all_attentions + (layer_outputs[1],)
+                all_attentions = all_attentions + (layer_outputs[1],)  # We keep only self-attention weights for now
 
         hidden_states = self.final_layer_norm(hidden_states)
         layer_output = self.dropout(hidden_states)
diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py
index 2c67b83c25..091bd742b5 100644
--- a/transformers/tests/modeling_t5_test.py
+++ b/transformers/tests/modeling_t5_test.py
@@ -45,9 +45,10 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
         def __init__(self,
                      parent,
                      batch_size=13,
-                     seq_length=7,
+                     encoder_seq_length=7,
+                     decoder_seq_length=9,
                      is_training=True,
-                     use_input_mask=True,
+                     use_attention_mask=True,
                      use_labels=True,
                      vocab_size=99,
                      n_positions=14,
@@ -62,9 +63,10 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
                     ):
             self.parent = parent
             self.batch_size = batch_size
-            self.seq_length = seq_length
+            self.encoder_seq_length = encoder_seq_length
+            self.decoder_seq_length = decoder_seq_length
             self.is_training = is_training
-            self.use_input_mask = use_input_mask
+            self.use_attention_mask = use_attention_mask
             self.use_labels = use_labels
             self.vocab_size = vocab_size
             self.n_positions = n_positions
@@ -78,15 +80,18 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
             self.scope = scope
 
         def prepare_config_and_inputs(self):
-            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+            encoder_input_ids = ids_tensor([self.batch_size, self.encoder_seq_length], self.vocab_size)
+            decoder_input_ids = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
 
-            input_mask = None
-            if self.use_input_mask:
-                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+            encoder_attention_mask = None
+            decoder_attention_mask = None
+            if self.use_attention_mask:
+                encoder_attention_mask = ids_tensor([self.batch_size, self.encoder_seq_length], vocab_size=2)
+                decoder_attention_mask = ids_tensor([self.batch_size, self.decoder_seq_length], vocab_size=2)
 
-            token_labels = None
+            decoder_lm_labels = None
             if self.use_labels:
-                token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+                decoder_lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
 
             config = T5Config(
                 vocab_size_or_config_json_file=self.vocab_size,
@@ -100,21 +105,22 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
                 dropout_rate=self.dropout_rate,
                 initializer_factor=self.initializer_factor)
 
-            return (config, input_ids, input_mask, token_labels)
+            return (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels)
 
         def check_loss_output(self, result):
             self.parent.assertListEqual(
                 list(result["loss"].size()),
                 [])
 
-        def create_and_check_t5_model(self, config, input_ids, input_mask, token_labels):
+        def create_and_check_t5_model(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
             model = T5Model(config=config)
             model.eval()
-            encoder_output, decoder_output = model(encoder_input_ids=input_ids,
-                                                   decoder_input_ids=input_ids,
-                                                   decoder_attention_mask=input_mask)
-            encoder_output, decoder_output = model(encoder_input_ids=input_ids,
-                                                   decoder_input_ids=input_ids)
+            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
+                                                   decoder_input_ids=decoder_input_ids,
+                                                   encoder_attention_mask=encoder_attention_mask,
+                                                   decoder_attention_mask=decoder_attention_mask)
+            decoder_output, encoder_output = model(encoder_input_ids=encoder_input_ids,
+                                                   decoder_input_ids=decoder_input_ids)
 
             result = {
                 "encoder_output": encoder_output,
@@ -122,17 +128,17 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
             }
             self.parent.assertListEqual(
                 list(result["encoder_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                [self.batch_size, self.encoder_seq_length, self.hidden_size])
             self.parent.assertListEqual(
                 list(result["decoder_output"].size()),
-                [self.batch_size, self.seq_length, self.hidden_size])
+                [self.batch_size, self.decoder_seq_length, self.hidden_size])
 
 
-        def create_and_check_t5_with_lm_head(self, config, input_ids, input_mask, token_labels):
+        def create_and_check_t5_with_lm_head(self, config, encoder_input_ids, decoder_input_ids, encoder_attention_mask, decoder_attention_mask, decoder_lm_labels):
             model = T5WithLMHeadModel(config=config)
             model.eval()
-            outputs = model(encoder_input_ids=input_ids, decoder_input_ids=input_ids,
-                            decoder_attention_mask=input_mask, decoder_lm_labels=token_labels)
+            outputs = model(encoder_input_ids=encoder_input_ids, decoder_input_ids=decoder_input_ids,
+                            decoder_attention_mask=decoder_attention_mask, decoder_lm_labels=decoder_lm_labels)
             loss, prediction_scores = outputs[0], outputs[1]
             result = {
                 "loss": loss,
@@ -140,15 +146,17 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
             }
             self.parent.assertListEqual(
                 list(result["prediction_scores"].size()),
-                [self.batch_size, self.seq_length, self.vocab_size])
+                [self.batch_size, self.decoder_seq_length, self.vocab_size])
             self.check_loss_output(result)
 
         def prepare_config_and_inputs_for_common(self):
             config_and_inputs = self.prepare_config_and_inputs()
-            (config, input_ids, input_mask, token_labels) = config_and_inputs
-            inputs_dict = {'encoder_input_ids': input_ids,
-                           'decoder_input_ids': input_ids,
-                           'decoder_attention_mask': input_mask}
+            (config, encoder_input_ids, decoder_input_ids, encoder_attention_mask,
+             decoder_attention_mask, decoder_lm_labels) = config_and_inputs
+            inputs_dict = {'encoder_input_ids': encoder_input_ids,
+                           'decoder_input_ids': decoder_input_ids,
+                           'decoder_attention_mask': decoder_attention_mask,
+                           'encoder_attention_mask': encoder_attention_mask}
             return config, inputs_dict
 
     def setUp(self):

From f3776df0f3daca86634862fe3ba7da6ae2b9a663 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 2 Dec 2019 15:47:00 +0100
Subject: [PATCH 015/110] WIP debugging

---
 transformers/modeling_t5.py | 61 +++++++++++++++++++++++++++----------
 1 file changed, 45 insertions(+), 16 deletions(-)

diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index 2a74333d31..1bf55611a2 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -132,6 +132,21 @@ def load_tf_weights_in_t5(model, config, tf_checkpoint_path):
 # - PreTrainedModel for the models (it-self a sub-class of torch.nn.Module)
 ####################################################
 
+class T5LayerNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """ Construct a layernorm module in the T5 style
+            No bias and no subtraction of mean.
+        """
+        super(T5LayerNorm, self).__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, x):
+        variance = x.pow(2).mean(-1, keepdim=True)
+        x = x / torch.sqrt(variance + self.variance_epsilon)
+        return self.weight * x
+
+
 class T5DenseReluDense(nn.Module):
     def __init__(self, config):
         super(T5DenseReluDense, self).__init__()
@@ -151,7 +166,7 @@ class T5LayerFF(nn.Module):
     def __init__(self, config):
         super(T5LayerFF, self).__init__()
         self.DenseReluDense = T5DenseReluDense(config)
-        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
 
     def forward(self, hidden_states):
@@ -316,13 +331,14 @@ class T5Attention(nn.Module):
             cache[self.layer_id] = (k, v)
 
         # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
-        scores = torch.matmul(q, k.transpose(2, 3))                           # (bs, n_heads, qlen, klen)
+        scores = torch.einsum('bnqd,bnkd->bnqk', q, k)                        # (bs, n_heads, qlen, klen)
 
         if position_bias is None:
             if not self.has_relative_attention_bias:
                 raise ValueError("No position_bias provided and no weights to compute position_bias")
             position_bias = self.compute_bias(qlen, klen)
         scores += position_bias
+        special_out = position_bias
 
         if mask is not None:
             scores += mask
@@ -346,14 +362,14 @@ class T5Attention(nn.Module):
             outputs = outputs + (weights,)
         if self.has_relative_attention_bias:
             outputs = outputs + (position_bias,)
-        return outputs
+        return outputs + (special_out,)
 
 
 class T5LayerSelfAttention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
         super(T5LayerSelfAttention, self).__init__()
         self.SelfAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
-        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
 
     def forward(self, hidden_states, attention_mask=None, position_bias=None, head_mask=None):
@@ -363,16 +379,18 @@ class T5LayerSelfAttention(nn.Module):
                                               position_bias=position_bias,
                                               head_mask=head_mask)
         y = attention_output[0]
+        special_out = attention_output[-1]
+        attention_output = attention_output[:-1]
         layer_output = hidden_states + self.dropout(y)
         outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
-        return outputs
+        return outputs + (special_out,)
 
 
 class T5LayerCrossAttention(nn.Module):
     def __init__(self, config, has_relative_attention_bias=False):
         super(T5LayerCrossAttention, self).__init__()
         self.EncDecAttention = T5Attention(config, has_relative_attention_bias=has_relative_attention_bias)
-        self.layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
 
     def forward(self, hidden_states, kv, attention_mask=None, position_bias=None, head_mask=None):
@@ -408,7 +426,8 @@ class T5Block(nn.Module):
                                                 position_bias=position_bias,
                                                 head_mask=head_mask)
         hidden_states = self_attention_outputs[0]
-        outputs = self_attention_outputs[1:]  # Keep self-attention outputs and relative position weights
+        special_out = self_attention_outputs[-1]
+        outputs = self_attention_outputs[1:-1]  # Keep self-attention outputs and relative position weights
 
         if not self.is_decoder:
             hidden_states = self.layer[1](hidden_states)
@@ -423,7 +442,7 @@ class T5Block(nn.Module):
             hidden_states = self.layer[2](hidden_states)
 
         outputs = (hidden_states,) + outputs  # add attentions if we output them
-        return outputs  # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+        return outputs + (special_out,)  # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
 
 
 class T5PreTrainedModel(PreTrainedModel):
@@ -438,8 +457,7 @@ class T5PreTrainedModel(PreTrainedModel):
     def _init_weights(self, module):
         """ Initialize the weights """
         factor = self.config.initializer_factor  # Used for testing weights initialization
-        if isinstance(module, nn.LayerNorm):
-            module.bias.data.zero_()
+        if isinstance(module, T5LayerNorm):
             module.weight.data.fill_(factor*1.0)
         elif isinstance(module, (T5Model, T5WithLMHeadModel)):
             # Mesh TensorFlow embeddings initialization
@@ -478,7 +496,7 @@ class T5Stack(T5PreTrainedModel):
 
         self.block = nn.ModuleList([T5Block(config, has_relative_attention_bias=bool(i == 0))
                                     for i in range(config.num_layers)])
-        self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+        self.final_layer_norm = T5LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
         self.dropout = nn.Dropout(config.dropout_rate)
 
         self.init_weights()
@@ -515,11 +533,11 @@ class T5Stack(T5PreTrainedModel):
 
         # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
         # masked positions, this operation will create a tensor which is 0.0 for
-        # positions we want to attend and -10000.0 for masked positions.
+        # positions we want to attend and -1e9 for masked positions.
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
         extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
 
         if self.is_decoder:
             # If a 2D ou 3D attention mask is provided for the cross-attention
@@ -530,7 +548,7 @@ class T5Stack(T5PreTrainedModel):
                 encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
 
             encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
-            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
         else:
             encoder_extended_attention_mask = None
 
@@ -553,6 +571,8 @@ class T5Stack(T5PreTrainedModel):
         all_attentions = ()
         position_bias = None
         encoder_decoder_position_bias = None
+
+        hidden_states = self.dropout(hidden_states)
         for i, layer_module in enumerate(self.block):
             if self.output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
@@ -564,6 +584,8 @@ class T5Stack(T5PreTrainedModel):
                                          encoder_attention_mask=encoder_extended_attention_mask,
                                          encoder_decoder_position_bias=encoder_decoder_position_bias,
                                          head_mask=head_mask[i])
+            if i == 0:
+                special_out = layer_outputs[-1]
             # layer_outputs is a tuple with:
             # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
             hidden_states = layer_outputs[0]
@@ -588,7 +610,7 @@ class T5Stack(T5PreTrainedModel):
             outputs = outputs + (all_hidden_states,)
         if self.output_attentions:
             outputs = outputs + (all_attentions,)
-        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
+        return outputs + (special_out,)  # last-layer hidden state, (all hidden states), (all attentions)
 
 
 T5_START_DOCSTRING = r"""    The T5 model was proposed in
@@ -707,9 +729,16 @@ class T5Model(T5PreTrainedModel):
 
         # Encode if needed (training, first prediction pass)
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
+        encoder_attention_mask = kwargs_encoder.get("attention_mask", None)
         if encoder_hidden_states is None:
             encoder_inputs_ids = kwargs_encoder.pop("input_ids")
             hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+
+            if encoder_attention_mask is not None:
+                # Apply masking
+                encoder_attention_mask = (encoder_attention_mask != 0).to(hidden_states)
+                hidden_states = hidden_states * encoder_attention_mask.unsqueeze(-1)
+
             encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
             encoder_hidden_states = encoder_outputs[0]
         else:
@@ -719,7 +748,7 @@ class T5Model(T5PreTrainedModel):
         decoder_inputs_ids = kwargs_decoder.pop("input_ids")
         hidden_states = self.shared(decoder_inputs_ids)  # Convert inputs in embeddings
         kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
-        kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
+        kwargs_decoder["encoder_attention_mask"] = encoder_attention_mask
         decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
 
         return decoder_outputs + encoder_outputs

From 7f998b1b832dd69cfdd8455afd5b8af3b2f77df8 Mon Sep 17 00:00:00 2001
From: Guillaume B <guillaume.becquin@gmail.com>
Date: Thu, 5 Dec 2019 08:57:49 +0100
Subject: [PATCH 016/110] special_tokens_mask value was unused and calculated
 twice

---
 transformers/tokenization_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 5d683629f0..6be96989cb 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -910,7 +910,7 @@ class PreTrainedTokenizer(object):
             token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
             special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0))
         if return_special_tokens_mask:
-            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
+            encoded_inputs["special_tokens_mask"] = special_tokens_mask
 
         # Prepare inputs as tensors if asked
         if return_tensors == 'tf' and is_tf_available():

From f8fb4335c9cd79789ed6119e729348e0a1b51e2b Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 5 Dec 2019 15:19:32 +0100
Subject: [PATCH 017/110] clean up a little bit PT <=> TF conversion

---
 transformers/convert_pytorch_checkpoint_to_tf2.py | 9 +++++----
 transformers/modeling_utils.py                    | 3 ++-
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py
index d1776e9c14..d20eafe2e9 100644
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -119,10 +119,11 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
         tf_inputs = tf.constant(inputs_list)
         tfo = tf_model(tf_inputs, training=False)  # build the network
 
-        pt_model = pt_model_class.from_pretrained(None,
-                                                  config=config,
-                                                  state_dict=torch.load(pytorch_checkpoint_path,
-                                                                        map_location='cpu'))
+        pt_model = pt_model_class(config)
+        pt_model.load_state_dict(torch.load(pytorch_checkpoint_path, map_location='cpu'),
+                                 strict-False)
+        pt_model.eval()
+
         pt_inputs = torch.tensor(inputs_list)
         with torch.no_grad():
             pto = pt_model(pt_inputs)
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index 398172a88c..3ac568771e 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -318,7 +318,8 @@ class PreTrainedModel(nn.Module):
             model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path:
+        if pretrained_model_name_or_path is not None and (
+                "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path):
             logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
                            "https://github.com/google-research/google-research/issues/119 for more information.")
 

From f230d91b437c806e3e2dad37318a5ce77d208fa6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Thu, 5 Dec 2019 21:24:57 +0100
Subject: [PATCH 018/110] check the validity of links

We add a script and a CI workflow to check that all download links
present in the source code are valid.
---
 .circleci/config.yml | 11 ++++++
 utils/link_tester.py | 79 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 90 insertions(+)
 create mode 100644 utils/link_tester.py

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 01e6d82b33..ebfbd79b93 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -82,6 +82,16 @@ jobs:
             - run: sudo pip install --progress-bar off -r docs/requirements.txt
             - run: sudo pip install --progress-bar off -r requirements.txt
             - run: ./.circleci/deploy.sh
+    repository_consistency:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        resource_class: small
+        parallelism: 1
+        steps:
+            - checkout
+            - run: sudo pip install requests
+            - run: python ./utils/link_tester.py
 workflow_filters: &workflow_filters
     filters:
         branches:
@@ -91,6 +101,7 @@ workflows:
     version: 2
     build_and_test:
         jobs:
+            - repository_consistency
             - build_py3_torch_and_tf
             - build_py3_torch
             - build_py3_tf
diff --git a/utils/link_tester.py b/utils/link_tester.py
new file mode 100644
index 0000000000..fe3990d28c
--- /dev/null
+++ b/utils/link_tester.py
@@ -0,0 +1,79 @@
+""" Link tester.
+
+This little utility reads all the python files in the repository,
+scans for links pointing to S3 and tests the links one by one. Raises an error
+at the end of the scan if at least one link was reported broken.
+"""
+import os
+import re
+import sys
+
+import requests
+
+
+REGEXP_FIND_S3_LINKS = r"""([\"'])(https:\/\/s3)(.*)?\1"""
+
+
+def list_python_files_in_repository():
+    """ List all python files in the repository.
+
+    This function assumes that the script is executed in the root folder.
+    """
+    source_code_files = []
+    for path, subdirs, files in os.walk("."):
+        if "templates" in path:
+            continue
+        for name in files:
+            if ".py" in name and ".pyc" not in name:
+                path_to_files = os.path.join(path, name)
+                source_code_files.append(path_to_files)
+
+    return source_code_files
+
+
+def find_all_links(file_paths):
+    links = []
+    for path in file_paths:
+        links += scan_code_for_links(path)
+
+    return links
+
+
+def scan_code_for_links(source):
+    """ Scans the file to find links using a regular expression.
+    Returns a list of links.
+    """
+    with open(source, 'r') as content:
+        content = content.read()
+        raw_links = re.findall(REGEXP_FIND_S3_LINKS, content)
+        links = [prefix + suffix for _, prefix, suffix in raw_links]
+
+    return links
+
+
+def check_all_links(links):
+    """ Check that the provided links are valid.
+
+    Links are considered valid if a HEAD request to the server
+    returns a 200 status code.
+    """
+    broken_links = []
+    for link in links:
+        head = requests.head(link)
+        if head.status_code != 200:
+            broken_links.append(link)
+
+    return broken_links
+
+
+if __name__ == "__main__":
+    file_paths = list_python_files_in_repository()
+    links = find_all_links(file_paths)
+    broken_links = check_all_links(links)
+    print("Looking for broken links to pre-trained models/configs/tokenizers...")
+    if broken_links:
+        print("The following links did not respond:")
+        for link in broken_links:
+            print("- {}".format(link))
+        sys.exit(1)
+    print("All links are ok.")

From 21451ec6ba364de78c14e7d05a55913da2809844 Mon Sep 17 00:00:00 2001
From: Philipp Glock <pglock@users.noreply.github.com>
Date: Fri, 6 Dec 2019 10:32:43 +0100
Subject: [PATCH 019/110] handle string with only whitespaces as empty

---
 transformers/tokenization_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 5d683629f0..bc246cc8fe 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -634,7 +634,7 @@ class PreTrainedTokenizer(object):
             return result
 
         def split_on_tokens(tok_list, text):
-            if not text:
+            if not text.strip():
                 return []
             if not tok_list:
                 return self._tokenize(text, **kwargs)

From 1d87b37d100c69ff3b2c1a5dfd271b6cf777176e Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 6 Dec 2019 15:30:09 +0100
Subject: [PATCH 020/110] updating

---
 .../convert_pytorch_checkpoint_to_tf2.py      | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py
index d20eafe2e9..2c419888e8 100644
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -119,10 +119,10 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
         tf_inputs = tf.constant(inputs_list)
         tfo = tf_model(tf_inputs, training=False)  # build the network
 
-        pt_model = pt_model_class(config)
-        pt_model.load_state_dict(torch.load(pytorch_checkpoint_path, map_location='cpu'),
-                                 strict-False)
-        pt_model.eval()
+        state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
+        pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None,
+                                                  config=config,
+                                                  state_dict=state_dict)
 
         pt_inputs = torch.tensor(inputs_list)
         with torch.no_grad():
@@ -140,7 +140,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
 
 
 def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortcut_names_or_path=None, config_shortcut_names_or_path=None,
-                                     compare_with_pt_model=False, use_cached_models=False, only_convert_finetuned_models=False):
+                                     compare_with_pt_model=False, use_cached_models=False, remove_cached_files=False, only_convert_finetuned_models=False):
     assert os.path.isdir(args.tf_dump_path), "--tf_dump_path should be a directory"
 
     if args_model_type is None:
@@ -188,13 +188,15 @@ def convert_all_pt_checkpoints_to_tf(args_model_type, tf_dump_path, model_shortc
 
             if os.path.isfile(model_shortcut_name):
                 model_shortcut_name = 'converted_model'
+
             convert_pt_checkpoint_to_tf(model_type=model_type,
                                         pytorch_checkpoint_path=model_file,
                                         config_file=config_file,
                                         tf_dump_path=os.path.join(tf_dump_path, model_shortcut_name + '-tf_model.h5'),
                                         compare_with_pt_model=compare_with_pt_model)
-            os.remove(config_file)
-            os.remove(model_file)
+            if remove_cached_files:
+                os.remove(config_file)
+                os.remove(model_file)
 
 
 if __name__ == "__main__":
@@ -227,6 +229,9 @@ if __name__ == "__main__":
     parser.add_argument("--use_cached_models",
                         action='store_true',
                         help = "Use cached models if possible instead of updating to latest checkpoint versions.")
+    parser.add_argument("--remove_cached_files",
+                        action='store_true',
+                        help = "Remove pytorch models after conversion (save memory when converting in batches).")
     parser.add_argument("--only_convert_finetuned_models",
                         action='store_true',
                         help = "Only convert finetuned models.")
@@ -246,4 +251,5 @@ if __name__ == "__main__":
                                         config_shortcut_names_or_path=[args.config_file] if args.config_file is not None else None,
                                         compare_with_pt_model=args.compare_with_pt_model,
                                         use_cached_models=args.use_cached_models,
+                                        remove_cached_files=args.remove_cached_files,
                                         only_convert_finetuned_models=args.only_convert_finetuned_models)

From 169fea6855741315e2e0e15881cefc9823803aa6 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 9 Dec 2019 16:25:33 +0100
Subject: [PATCH 021/110] updating T5

---
 transformers/modeling_t5.py | 31 +++++++++++++------------------
 1 file changed, 13 insertions(+), 18 deletions(-)

diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index 1bf55611a2..104e9060fc 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -281,7 +281,7 @@ class T5Attention(nn.Module):
         context_position = torch.arange(qlen, dtype=torch.long)[:, None]
         memory_position = torch.arange(klen, dtype=torch.long)[None, :]
         relative_position = memory_position - context_position  # shape (qlen, klen)
-        rp_bucket = self._relative_position_bucket(relative_position,
+        rp_bucket = self._relative_position_bucket(relative_position,  # shape (qlen, klen)
                                                    bidirectional=not self.is_decoder,
                                                    num_buckets=self.relative_attention_num_buckets)
         values = self.relative_attention_bias(rp_bucket)  # shape (qlen, klen, num_heads)
@@ -337,14 +337,10 @@ class T5Attention(nn.Module):
             if not self.has_relative_attention_bias:
                 raise ValueError("No position_bias provided and no weights to compute position_bias")
             position_bias = self.compute_bias(qlen, klen)
+            if mask is not None:
+                position_bias += mask                                         # (bs, n_heads, qlen, klen)
+
         scores += position_bias
-        special_out = position_bias
-
-        if mask is not None:
-            scores += mask
-            # mask = (mask == 0).expand_as(scores)                              # (bs, n_heads, qlen, klen)
-            # scores.masked_fill_(mask, -float('inf'))                          # (bs, n_heads, qlen, klen)
-
         weights = F.softmax(scores.float(), dim=-1).type_as(scores)           # (bs, n_heads, qlen, klen)
         weights = F.dropout(weights, p=self.dropout, training=self.training)  # (bs, n_heads, qlen, klen)
 
@@ -362,7 +358,7 @@ class T5Attention(nn.Module):
             outputs = outputs + (weights,)
         if self.has_relative_attention_bias:
             outputs = outputs + (position_bias,)
-        return outputs + (special_out,)
+        return outputs
 
 
 class T5LayerSelfAttention(nn.Module):
@@ -379,11 +375,9 @@ class T5LayerSelfAttention(nn.Module):
                                               position_bias=position_bias,
                                               head_mask=head_mask)
         y = attention_output[0]
-        special_out = attention_output[-1]
-        attention_output = attention_output[:-1]
         layer_output = hidden_states + self.dropout(y)
         outputs = (layer_output,) + attention_output[1:]  # add attentions if we output them
-        return outputs + (special_out,)
+        return outputs
 
 
 class T5LayerCrossAttention(nn.Module):
@@ -426,8 +420,7 @@ class T5Block(nn.Module):
                                                 position_bias=position_bias,
                                                 head_mask=head_mask)
         hidden_states = self_attention_outputs[0]
-        special_out = self_attention_outputs[-1]
-        outputs = self_attention_outputs[1:-1]  # Keep self-attention outputs and relative position weights
+        outputs = self_attention_outputs[1:]  # Keep self-attention outputs and relative position weights
 
         if not self.is_decoder:
             hidden_states = self.layer[1](hidden_states)
@@ -442,7 +435,7 @@ class T5Block(nn.Module):
             hidden_states = self.layer[2](hidden_states)
 
         outputs = (hidden_states,) + outputs  # add attentions if we output them
-        return outputs + (special_out,)  # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
+        return outputs  # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
 
 
 class T5PreTrainedModel(PreTrainedModel):
@@ -536,6 +529,10 @@ class T5Stack(T5PreTrainedModel):
         # positions we want to attend and -1e9 for masked positions.
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
+
+        # T5 has a mask that can compare sequence ids, we simulate this here with this transposition
+        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+        extended_attention_mask = (extended_attention_mask == extended_attention_mask.transpose(-1, -2))
         extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
         extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
 
@@ -584,8 +581,6 @@ class T5Stack(T5PreTrainedModel):
                                          encoder_attention_mask=encoder_extended_attention_mask,
                                          encoder_decoder_position_bias=encoder_decoder_position_bias,
                                          head_mask=head_mask[i])
-            if i == 0:
-                special_out = layer_outputs[-1]
             # layer_outputs is a tuple with:
             # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
             hidden_states = layer_outputs[0]
@@ -610,7 +605,7 @@ class T5Stack(T5PreTrainedModel):
             outputs = outputs + (all_hidden_states,)
         if self.output_attentions:
             outputs = outputs + (all_attentions,)
-        return outputs + (special_out,)  # last-layer hidden state, (all hidden states), (all attentions)
+        return outputs  # last-layer hidden state, (all hidden states), (all attentions)
 
 
 T5_START_DOCSTRING = r"""    The T5 model was proposed in

From b016dd16c90c2c18168d13bca6d5002729fd5b0a Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 9 Dec 2019 21:38:07 +0100
Subject: [PATCH 022/110] fix tests on python 3.5

---
 transformers/modeling_t5.py                |  2 +-
 transformers/tests/modeling_common_test.py | 15 ++++++++-------
 transformers/tokenization_t5.py            |  2 +-
 3 files changed, 10 insertions(+), 9 deletions(-)

diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index 104e9060fc..e48293b49e 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -338,7 +338,7 @@ class T5Attention(nn.Module):
                 raise ValueError("No position_bias provided and no weights to compute position_bias")
             position_bias = self.compute_bias(qlen, klen)
             if mask is not None:
-                position_bias += mask                                         # (bs, n_heads, qlen, klen)
+                position_bias = position_bias + mask                          # (bs, n_heads, qlen, klen)
 
         scores += position_bias
         weights = F.softmax(scores.float(), dim=-1).type_as(scores)           # (bs, n_heads, qlen, klen)
diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index ee75da605c..11aeaafe31 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -138,8 +138,8 @@ class CommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
+                    self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length])
                 out_len = len(outputs)
 
                 if self.is_encoder_decoder:
@@ -151,8 +151,8 @@ class CommonTestCases:
                     self.assertListEqual(
                         list(decoder_attentions[0].shape[-3:]),
                         [self.model_tester.num_attention_heads,
-                         self.model_tester.seq_length,
-                         self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                         self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length,
+                         self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length])
 
                 # Check attention is always last and order is fine
                 config.output_attentions = True
@@ -169,8 +169,8 @@ class CommonTestCases:
                 self.assertListEqual(
                     list(self_attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
+                    self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length])
 
         def test_torchscript(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
@@ -440,7 +440,8 @@ class CommonTestCases:
                 self.assertEqual(len(hidden_states), self.model_tester.num_hidden_layers + 1)
                 self.assertListEqual(
                     list(hidden_states[0].shape[-2:]),
-                    [self.model_tester.seq_length, self.model_tester.hidden_size])
+                    [self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
+                     self.model_tester.hidden_size])
 
         def test_resize_tokens_embeddings(self):
             original_config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py
index 3847aeefbf..933084d13a 100644
--- a/transformers/tokenization_t5.py
+++ b/transformers/tokenization_t5.py
@@ -134,7 +134,7 @@ class T5Tokenizer(PreTrainedTokenizer):
         """ Converts a token (str/unicode) in an id using the vocab. """
         if token.startswith(u"<extra_id_"):
             l = re.match(r'<extra_id_(\d+)>', token)
-            num = int(l[1])
+            num = int(l.group(1))
             return self.vocab_size - num - 1
         return self.sp_model.piece_to_id(token)
 

From 808bb8da7edbd9f5858b3c223ebac9bd83275934 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 9 Dec 2019 21:48:34 +0100
Subject: [PATCH 023/110] fix transfo xl tests

---
 transformers/tests/modeling_common_test.py     | 18 ++++++++++++------
 .../tests/modeling_tf_transfo_xl_test.py       |  2 +-
 transformers/tests/modeling_transfo_xl_test.py |  2 +-
 3 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index 11aeaafe31..7033a06d0b 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -125,6 +125,11 @@ class CommonTestCases:
         def test_attention_outputs(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
+            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
+
             for model_class in self.all_model_classes:
                 config.output_attentions = True
                 config.output_hidden_states = False
@@ -138,8 +143,8 @@ class CommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                    self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
-                    self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length])
+                    encoder_seq_length ,
+                    encoder_key_length])
                 out_len = len(outputs)
 
                 if self.is_encoder_decoder:
@@ -151,8 +156,9 @@ class CommonTestCases:
                     self.assertListEqual(
                         list(decoder_attentions[0].shape[-3:]),
                         [self.model_tester.num_attention_heads,
-                         self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length,
-                         self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length])
+                         decoder_seq_length,
+                         decoder_key_length
+                         ])
 
                 # Check attention is always last and order is fine
                 config.output_attentions = True
@@ -169,8 +175,8 @@ class CommonTestCases:
                 self.assertListEqual(
                     list(self_attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                    self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length,
-                    self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length])
+                    encoder_seq_length,
+                    encoder_key_length])
 
         def test_torchscript(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py
index 534fe39646..8ebd749b4c 100644
--- a/transformers/tests/modeling_tf_transfo_xl_test.py
+++ b/transformers/tests/modeling_tf_transfo_xl_test.py
@@ -68,7 +68,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
             self.batch_size = batch_size
             self.seq_length = seq_length
             self.mem_len = mem_len
-            self.key_len = seq_length + mem_len
+            self.key_length = seq_length + mem_len
             self.clamp_len = clamp_len
             self.is_training = is_training
             self.use_labels = use_labels
diff --git a/transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py
index f7b913da5b..2d1541d87b 100644
--- a/transformers/tests/modeling_transfo_xl_test.py
+++ b/transformers/tests/modeling_transfo_xl_test.py
@@ -66,7 +66,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
             self.batch_size = batch_size
             self.seq_length = seq_length
             self.mem_len = mem_len
-            self.key_len = seq_length + mem_len
+            self.key_length = seq_length + mem_len
             self.clamp_len = clamp_len
             self.is_training = is_training
             self.use_labels = use_labels

From 8e651f56b75982f07fc522b62f298d8d70e6e56f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 9 Dec 2019 22:13:57 +0100
Subject: [PATCH 024/110] fix tf tests

---
 transformers/tests/modeling_tf_common_test.py | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py
index 20ccfd8ce0..26bd037c9e 100644
--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -213,6 +213,11 @@ class TFCommonTestCases:
         def test_attention_outputs(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
 
+            decoder_seq_length = self.model_tester.decoder_seq_length if hasattr(self.model_tester, 'decoder_seq_length') else self.model_tester.seq_length
+            encoder_seq_length = self.model_tester.encoder_seq_length if hasattr(self.model_tester, 'encoder_seq_length') else self.model_tester.seq_length
+            decoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else decoder_seq_length
+            encoder_key_length = self.model_tester.key_length if hasattr(self.model_tester, 'key_length') else encoder_seq_length
+
             for model_class in self.all_model_classes:
                 config.output_attentions = True
                 config.output_hidden_states = False
@@ -225,8 +230,8 @@ class TFCommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_seq_length,
+                    encoder_key_length])
                 out_len = len(outputs)
 
                 if self.is_encoder_decoder:
@@ -238,8 +243,8 @@ class TFCommonTestCases:
                     self.assertListEqual(
                         list(decoder_attentions[0].shape[-3:]),
                         [self.model_tester.num_attention_heads,
-                         self.model_tester.seq_length,
-                         self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                         decoder_seq_length,
+                         decoder_key_length])
 
                 # Check attention is always last and order is fine
                 config.output_attentions = True
@@ -255,8 +260,8 @@ class TFCommonTestCases:
                 self.assertListEqual(
                     list(attentions[0].shape[-3:]),
                     [self.model_tester.num_attention_heads,
-                    self.model_tester.seq_length,
-                    self.model_tester.key_len if hasattr(self.model_tester, 'key_len') else self.model_tester.seq_length])
+                    encoder_seq_length,
+                    encoder_key_length])
 
         def test_headmasking(self):
             pass

From 608a8f5b567f81f3cc997a195496dd8bf1c28158 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 10 Dec 2019 10:01:01 +0100
Subject: [PATCH 025/110] updating tf 2.0 layer_norm to T5 layer norm

---
 transformers/modeling_tf_t5.py | 43 ++++++++++++++++++++++++----------
 1 file changed, 30 insertions(+), 13 deletions(-)

diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py
index c1de4745c2..11762ee1e5 100644
--- a/transformers/modeling_tf_t5.py
+++ b/transformers/modeling_tf_t5.py
@@ -17,16 +17,11 @@
 
 from __future__ import absolute_import, division, print_function, unicode_literals
 
-import json
 import logging
 import math
-import os
-import sys
 import copy
 import itertools
-from io import open
 
-import numpy as np
 import tensorflow as tf
 
 from .configuration_t5 import T5Config
@@ -45,6 +40,28 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
 # - TFPreTrainedModel for the models (it-self a sub-class of tf.keras.Model)
 ####################################################
 
+class TFT5LayerNorm(tf.keras.layers.Layer):
+    def __init__(self, epsilon=1e-6, **kwargs):
+        """ Construct a layernorm module in the T5 style:
+            no bias and no subtraction of the mean (inputs are only rescaled).
+        """
+        super(TFT5LayerNorm, self).__init__(**kwargs)
+        self.variance_epsilon = epsilon  # added to the mean square before the rsqrt
+
+    def build(self, input_shape):
+        """ Create the per-feature scale weight (last input dim, initialized to ones). """
+        self.weight = self.add_weight(
+            "weight",
+            shape=(input_shape[-1],),
+            initializer='ones')
+        super(TFT5LayerNorm, self).build(input_shape)
+
+    def call(self, x):
+        variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)  # mean (not min) of squares over the last axis
+        x = x * tf.math.rsqrt(variance + self.variance_epsilon)
+        return self.weight * x
+
+
 class TFT5DenseReluDense(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFT5DenseReluDense, self).__init__(**kwargs)
@@ -65,8 +82,8 @@ class TFT5LayerFF(tf.keras.layers.Layer):
     def __init__(self, config, **kwargs):
         super(TFT5LayerFF, self).__init__(**kwargs)
         self.DenseReluDense = TFT5DenseReluDense(config, name='DenseReluDense')
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon,
-                                                             name='layer_norm')
+        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
+                                        name='layer_norm')
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
 
     def call(self, hidden_states, training=False):
@@ -249,8 +266,8 @@ class TFT5LayerSelfAttention(tf.keras.layers.Layer):
         self.SelfAttention = TFT5Attention(config,
                                            has_relative_attention_bias=has_relative_attention_bias,
                                            name='SelfAttention')
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon,
-                                                             name='layer_norm')
+        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
+                                        name='layer_norm')
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
 
     def call(self, hidden_states, attention_mask=None, position_bias=None,
@@ -273,8 +290,8 @@ class TFT5LayerCrossAttention(tf.keras.layers.Layer):
         self.EncDecAttention = TFT5Attention(config,
                                            has_relative_attention_bias=has_relative_attention_bias,
                                            name='EncDecAttention')
-        self.layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon,
-                                                             name='layer_norm')
+        self.layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
+                                        name='layer_norm')
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
 
     def call(self, hidden_states, kv, attention_mask=None, position_bias=None,
@@ -353,8 +370,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
                                 has_relative_attention_bias=bool(i == 0),
                                 name='block_._{}'.format(i))
                         for i in range(config.num_layers)]
-        self.final_layer_norm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon,
-                                                                   name='final_layer_norm')
+        self.final_layer_norm = TFT5LayerNorm(epsilon=config.layer_norm_epsilon,
+                                              name='final_layer_norm')
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
 
     def _resize_token_embeddings(self, new_num_tokens):

From 8ae1044f80ef543e4657c97d1030649d4da15aa8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 10 Dec 2019 15:11:07 +0100
Subject: [PATCH 026/110] updating tests and TF 2.0 model

---
 transformers/modeling_t5.py                   | 31 ++++++---
 transformers/modeling_tf_t5.py                | 44 ++++++++++---
 transformers/tests/modeling_common_test.py    | 18 +++--
 transformers/tests/modeling_t5_test.py        |  9 ++-
 transformers/tests/modeling_tf_common_test.py | 65 +++++++++++--------
 transformers/tests/modeling_tf_t5_test.py     | 10 +--
 transformers/tests/tokenization_t5_test.py    |  1 -
 7 files changed, 121 insertions(+), 57 deletions(-)

diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index e48293b49e..f1e4e0306c 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -726,8 +726,11 @@ class T5Model(T5PreTrainedModel):
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
         encoder_attention_mask = kwargs_encoder.get("attention_mask", None)
         if encoder_hidden_states is None:
-            encoder_inputs_ids = kwargs_encoder.pop("input_ids")
-            hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+            # Convert encoder inputs in embeddings if needed
+            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
+            if hidden_states is None:
+                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
 
             if encoder_attention_mask is not None:
                 # Apply masking
@@ -740,8 +743,12 @@ class T5Model(T5PreTrainedModel):
             encoder_outputs = ()
 
         # Decode
-        decoder_inputs_ids = kwargs_decoder.pop("input_ids")
-        hidden_states = self.shared(decoder_inputs_ids)  # Convert inputs in embeddings
+        # Convert decoder inputs in embeddings if needed
+        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
+        if hidden_states is None:
+            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+            hidden_states = self.shared(decoder_inputs_ids)
+
         kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
         kwargs_decoder["encoder_attention_mask"] = encoder_attention_mask
         decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
@@ -825,16 +832,24 @@ class T5WithLMHeadModel(T5PreTrainedModel):
         # Encode if needed (training, first prediction pass)
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
         if encoder_hidden_states is None:
-            encoder_inputs_ids = kwargs_encoder.pop("input_ids")
-            hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+            # Convert encoder inputs in embeddings if needed
+            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
+            if hidden_states is None:
+                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+
             encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
             encoder_hidden_states = encoder_outputs[0]
         else:
             encoder_outputs = ()
 
         # Decode
-        decoder_inputs_ids = kwargs_decoder.pop("input_ids")
-        hidden_states = self.shared(decoder_inputs_ids)  # Convert inputs in embeddings
+        # Convert decoder inputs in embeddings if needed
+        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
+        if hidden_states is None:
+            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+            hidden_states = self.shared(decoder_inputs_ids)
+
         kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
         kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
         decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py
index 11762ee1e5..447fd69b05 100644
--- a/transformers/modeling_tf_t5.py
+++ b/transformers/modeling_tf_t5.py
@@ -613,6 +613,12 @@ class TFT5Model(TFT5PreTrainedModel):
         decoder_config.is_decoder = True
         self.decoder = TFT5MainLayer(decoder_config, name='decoder')
 
+    def get_input_embeddings(self):
+        return self.shared
+
+    def get_output_embeddings(self):
+        return self.shared
+
     def call(self, decoder_input_ids, **kwargs):
         # We allow two types of multi-inputs:
         # - traditional keyword arguments in the call method
@@ -634,16 +640,24 @@ class TFT5Model(TFT5PreTrainedModel):
         # Encode if needed (training, first prediction pass)
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
         if encoder_hidden_states is None:
-            encoder_inputs_ids = kwargs_encoder.pop("input_ids")
-            hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+            # Convert encoder inputs in embeddings if needed
+            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
+            if hidden_states is None:
+                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+
             encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
             encoder_hidden_states = encoder_outputs[0]
         else:
             encoder_outputs = ()
 
         # Decode
-        decoder_inputs_ids = kwargs_decoder.pop("input_ids")
-        hidden_states = self.shared(decoder_inputs_ids)  # Convert inputs in embeddings
+        # Convert decoder inputs in embeddings if needed
+        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
+        if hidden_states is None:
+            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+            hidden_states = self.shared(decoder_inputs_ids)
+
         kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
         kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
         decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
@@ -692,6 +706,12 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel):
         decoder_config.is_decoder = True
         self.decoder = TFT5MainLayer(decoder_config, name='decoder')
 
+    def get_input_embeddings(self):
+        return self.shared
+
+    def get_output_embeddings(self):
+        return self.shared
+
     def call(self, decoder_input_ids, **kwargs):
         # We allow two types of multi-inputs:
         # - traditional keyword arguments in the call method
@@ -713,16 +733,24 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel):
         # Encode if needed (training, first prediction pass)
         encoder_hidden_states = kwargs_encoder.pop("hidden_states", None)
         if encoder_hidden_states is None:
-            encoder_inputs_ids = kwargs_encoder.pop("input_ids")
-            hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+            # Convert encoder inputs in embeddings if needed
+            hidden_states = kwargs_encoder.pop("inputs_embeds", None)
+            if hidden_states is None:
+                encoder_inputs_ids = kwargs_encoder.pop("input_ids")
+                hidden_states = self.shared(encoder_inputs_ids)  # Convert inputs in embeddings
+
             encoder_outputs = self.encoder(hidden_states, **kwargs_encoder)
             encoder_hidden_states = encoder_outputs[0]
         else:
             encoder_outputs = ()
 
         # Decode
-        decoder_inputs_ids = kwargs_decoder.pop("input_ids")
-        hidden_states = self.shared(decoder_inputs_ids)  # Convert inputs in embeddings
+        # Convert decoder inputs in embeddings if needed
+        hidden_states = kwargs_decoder.pop("inputs_embeds", None)
+        if hidden_states is None:
+            decoder_inputs_ids = kwargs_decoder.pop("input_ids")
+            hidden_states = self.shared(decoder_inputs_ids)
+
         kwargs_decoder["encoder_hidden_states"] = encoder_hidden_states
         kwargs_decoder["encoder_attention_mask"] = kwargs_encoder.get("attention_mask", None)
         decoder_outputs = self.decoder(hidden_states, **kwargs_decoder)
diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index cdfbfc09e2..792f5cee3e 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -568,8 +568,14 @@ class CommonTestCases:
 
         def test_inputs_embeds(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            input_ids = inputs_dict["input_ids"]
-            del inputs_dict["input_ids"]
+            if not self.is_encoder_decoder:
+                input_ids = inputs_dict["input_ids"]
+                del inputs_dict["input_ids"]
+            else:
+                encoder_input_ids = inputs_dict["encoder_input_ids"]
+                decoder_input_ids = inputs_dict["decoder_input_ids"]
+                del inputs_dict["encoder_input_ids"]
+                del inputs_dict["decoder_input_ids"]
 
             for model_class in self.all_model_classes:
                 model = model_class(config)
@@ -577,9 +583,13 @@ class CommonTestCases:
                 model.eval()
 
                 wte = model.get_input_embeddings()
-                inputs_dict["inputs_embeds"] = wte(input_ids)
-                outputs = model(**inputs_dict)
+                if not self.is_encoder_decoder:
+                    inputs_dict["inputs_embeds"] = wte(input_ids)
+                else:
+                    inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
+                    inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)
 
+                outputs = model(**inputs_dict)
 
     class GPTModelTester(CommonModelTester):
 
diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py
index 091bd742b5..a539cc868a 100644
--- a/transformers/tests/modeling_t5_test.py
+++ b/transformers/tests/modeling_t5_test.py
@@ -18,20 +18,19 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 
 from transformers import is_torch_available
 
-from .modeling_common_test import (CommonTestCases, ids_tensor)
+from .modeling_common_test import (CommonTestCases, ids_tensor, floats_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_torch, slow, torch_device
 
 if is_torch_available():
     from transformers import (T5Config, T5Model, T5WithLMHeadModel)
     from transformers.modeling_t5 import T5_PRETRAINED_MODEL_ARCHIVE_MAP
-else:
-    pytestmark = pytest.mark.skip("Require Torch")
 
 
+@require_torch
 class T5ModelTest(CommonTestCases.CommonModelTester):
 
     all_model_classes = (T5Model, T5WithLMHeadModel) if is_torch_available() else ()
@@ -174,7 +173,7 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in list(T5_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py
index 8957313021..a0d63583fb 100644
--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -130,12 +130,12 @@ class TFCommonTestCases:
                                       for name, key in inputs_dict.items())
                 with torch.no_grad():
                     pto = pt_model(**pt_inputs_dict)
-                tfo = tf_model(inputs_dict)
-                tfo = tfo[0].numpy()
-                pto = pto[0].numpy()
-                tfo[np.isnan(tfo)] = 0
-                pto[np.isnan(pto)] = 0
-                max_diff = np.amax(np.abs(tfo - pto))
+                tfo = tf_model(inputs_dict, training=False)
+                tf_hidden_states = tfo[0].numpy()
+                pt_hidden_states = pto[0].numpy()
+                tf_hidden_states[np.isnan(tf_hidden_states)] = 0
+                pt_hidden_states[np.isnan(pt_hidden_states)] = 0
+                max_diff = np.amax(np.abs(tf_hidden_states - pt_hidden_states))
                 self.assertLessEqual(max_diff, 2e-2)
 
                 # Check we can load pt model in tf and vice-versa with checkpoint => model functions
@@ -296,33 +296,46 @@ class TFCommonTestCases:
                 first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
                 self.assertTrue(tf.math.equal(first, second).numpy().all())
 
+        def _get_embeds(self, wte, input_ids):
+            # ^^ In our TF models, the input_embeddings can take slightly different forms,
+            # so we try a few of them.
+            # We used to fall back to just synthetically creating a dummy tensor of ones:
+            try:
+                x = wte(input_ids, mode="embedding")
+            except:
+                try:
+                    x = wte([input_ids], mode="embedding")
+                except:
+                    try:
+                        x = wte([input_ids, None, None, None], mode="embedding")
+                    except:
+                        if hasattr(self.model_tester, "embedding_size"):
+                            x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
+                        else:
+                            x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
+            return x
+
         def test_inputs_embeds(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
-            input_ids = inputs_dict["input_ids"]
-            del inputs_dict["input_ids"]
+            if not self.is_encoder_decoder:
+                input_ids = inputs_dict["input_ids"]
+                del inputs_dict["input_ids"]
+            else:
+                encoder_input_ids = inputs_dict["encoder_input_ids"]
+                decoder_input_ids = inputs_dict["decoder_input_ids"]
+                del inputs_dict["encoder_input_ids"]
+                del inputs_dict["decoder_input_ids"]
 
             for model_class in self.all_model_classes:
                 model = model_class(config)
 
                 wte = model.get_input_embeddings()
-                try:
-                    x = wte(input_ids, mode="embedding")
-                except:
-                    try:
-                        x = wte([input_ids], mode="embedding")
-                    except:
-                        try:
-                            x = wte([input_ids, None, None, None], mode="embedding")
-                        except:
-                            if hasattr(self.model_tester, "embedding_size"):
-                                x = tf.ones(input_ids.shape + [self.model_tester.embedding_size], dtype=tf.dtypes.float32)
-                            else:
-                                x = tf.ones(input_ids.shape + [self.model_tester.hidden_size], dtype=tf.dtypes.float32)
-                # ^^ In our TF models, the input_embeddings can take slightly different forms,
-                # so we try a few of them.
-                # We used to fall back to just synthetically creating a dummy tensor of ones:
-                #
-                inputs_dict["inputs_embeds"] = x
+                if not self.is_encoder_decoder:
+                    inputs_dict["inputs_embeds"] = self._get_embeds(wte, input_ids)
+                else:
+                    inputs_dict["encoder_inputs_embeds"] = self._get_embeds(wte, encoder_input_ids)
+                    inputs_dict["decoder_inputs_embeds"] = self._get_embeds(wte, decoder_input_ids)
+
                 outputs = model(inputs_dict)
 
 
diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py
index 33f6f895f0..99eec313f9 100644
--- a/transformers/tests/modeling_tf_t5_test.py
+++ b/transformers/tests/modeling_tf_t5_test.py
@@ -18,21 +18,21 @@ from __future__ import print_function
 
 import unittest
 import shutil
-import pytest
 import sys
 
 from .modeling_tf_common_test import (TFCommonTestCases, ids_tensor)
 from .configuration_common_test import ConfigTester
+from .utils import require_tf, slow
 
 from transformers import T5Config, is_tf_available
 
 if is_tf_available():
     import tensorflow as tf
-    from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
-else:
-    pytestmark = pytest.mark.skip("Require TensorFlow")
+    from transformers.modeling_tf_t5 import (TFT5Model, TFT5WithLMHeadModel,
+                                             TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP)
 
 
+@require_tf
 class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
 
     is_encoder_decoder = True
@@ -160,7 +160,7 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
         config_and_inputs = self.model_tester.prepare_config_and_inputs()
         self.model_tester.create_and_check_t5_with_lm_head(*config_and_inputs)
 
-    @pytest.mark.slow
+    @slow
     def test_model_from_pretrained(self):
         cache_dir = "/tmp/transformers_test/"
         for model_name in ['t5-small']:
diff --git a/transformers/tests/tokenization_t5_test.py b/transformers/tests/tokenization_t5_test.py
index aabb21e443..0b4f960e32 100644
--- a/transformers/tests/tokenization_t5_test.py
+++ b/transformers/tests/tokenization_t5_test.py
@@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import pytest
 
 from transformers.tokenization_t5 import (T5Tokenizer)
 from transformers.tokenization_xlnet import SPIECE_UNDERLINE

From 4b82c485de187896a38c441587b7bd4d04f2821e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi=20Louf?= <remilouf@gmail.com>
Date: Tue, 10 Dec 2019 14:49:53 +0100
Subject: [PATCH 027/110] remove misplaced summarization documentation

---
 examples/README.md | 30 ------------------------------
 1 file changed, 30 deletions(-)

diff --git a/examples/README.md b/examples/README.md
index 620304ea77..b6b3908810 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -24,8 +24,6 @@ pip install -r ./examples/requirements.txt
 | [Multiple Choice](#multiple-choice) | Examples running BERT/XLNet/RoBERTa on the SWAG/RACE/ARC tasks. 
 | [Named Entity Recognition](#named-entity-recognition) | Using BERT for Named Entity Recognition (NER) on the CoNLL 2003 dataset, examples with distributed training.                                                                                  |
 | [XNLI](#xnli) | Examples running BERT/XLM on the XNLI benchmark. |
-| [Abstractive summarization](#abstractive-summarization) | Using the BertAbs
-model finetuned on the CNN/DailyMail dataset to generate summaries. |
 
 ## TensorFlow 2.0 Bert models on GLUE
 
@@ -646,34 +644,6 @@ micro avg     0.8722    0.8774    0.8748     13869
 macro avg     0.8712    0.8774    0.8740     13869
 ```
 
-## Abstractive summarization
-
-Based on the script
-[`run_summarization_finetuning.py`](https://github.com/huggingface/transformers/blob/master/examples/run_summarization_finetuning.py).
-
-Before running this script you should download **both** CNN and Daily Mail
-datasets from [Kyunghyun Cho's website](https://cs.nyu.edu/~kcho/DMQA/)  (the
-links next to "Stories") in the same folder. Then uncompress the archives by running:
-
-```bash
-tar -xvf cnn_stories.tgz && tar -xvf dailymail_stories.tgz
-```
-
-note that the finetuning script **will not work** if you do not download both
-datasets. We will refer as `$DATA_PATH` the path to where you uncompressed both
-archive.
-
-```bash
-export DATA_PATH=/path/to/dataset/
-
-python run_summarization_finetuning.py \
-    --output_dir=output \
-    --model_type=bert2bert \
-    --model_name_or_path=bert2bert \
-    --do_train \
-    --data_path=$DATA_PATH \
-```
-
 ## XNLI
 
 Based on the script [`run_xnli.py`](https://github.com/huggingface/transformers/blob/master/examples/run_xnli.py).

From 981a5c8c1789f91204ba1053f4742f6ea8c615af Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 10 Dec 2019 15:36:19 +0100
Subject: [PATCH 028/110] updating models urls

---
 transformers/configuration_t5.py                  |  4 ++++
 transformers/convert_pytorch_checkpoint_to_tf2.py |  2 +-
 transformers/modeling_t5.py                       |  4 ++++
 transformers/modeling_tf_t5.py                    |  6 +++++-
 transformers/tokenization_t5.py                   | 12 ++++++++++--
 5 files changed, 24 insertions(+), 4 deletions(-)

diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py
index 83aab66fac..2ccdebc2b1 100644
--- a/transformers/configuration_t5.py
+++ b/transformers/configuration_t5.py
@@ -28,6 +28,10 @@ logger = logging.getLogger(__name__)
 
 T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
+    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
+    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
+    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-config.json",
+    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-config.json",
 }
 
 
diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py
index 4c4becfa00..06bb5f47c0 100644
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -121,7 +121,7 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
 
     if compare_with_pt_model:
         inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-        tf_inputs = tf.constant(inputs_list)
+        tf_inputs = tf_model.dummy_inputs
         tfo = tf_model(tf_inputs, training=False)  # build the network
 
         pt_model = pt_model_class.from_pretrained(None,
diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index f1e4e0306c..ffc4d8bb3f 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -42,6 +42,10 @@ logger = logging.getLogger(__name__)
 ####################################################
 T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
     't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
+    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
+    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
+    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-pytorch_model.bin",
+    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-pytorch_model.bin",
 }
 
 ####################################################
diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py
index 447fd69b05..0b3b1116f2 100644
--- a/transformers/modeling_tf_t5.py
+++ b/transformers/modeling_tf_t5.py
@@ -25,13 +25,17 @@ import itertools
 import tensorflow as tf
 
 from .configuration_t5 import T5Config
-from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list, get_initializer, DUMMY_INPUTS
+from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list
 from .file_utils import add_start_docstrings
 
 logger = logging.getLogger(__name__)
 
 TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
     't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
+    't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5",
+    't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5",
+    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-tf_model.h5",
+    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-tf_model.h5",
 }
 
 ####################################################
diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py
index 933084d13a..62e9c069e2 100644
--- a/transformers/tokenization_t5.py
+++ b/transformers/tokenization_t5.py
@@ -41,7 +41,11 @@ VOCAB_FILES_NAMES = {'vocab_file': 'spiece.model'}
 PRETRAINED_VOCAB_FILES_MAP = {
     'vocab_file':
     {
-        't5': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
     }
 }
 
@@ -49,7 +53,11 @@ PRETRAINED_VOCAB_FILES_MAP = {
 # Mapping from model shortcut names to max length of inputs
 ####################################################
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    't5': 512,
+    't5-small': 512,
+    't5-base': 512,
+    't5-large': 512,
+    't5-3B': 512,
+    't5-11B': 512,
 }
 
 class T5Tokenizer(PreTrainedTokenizer):

From a5df980c5b86e9106382a87a63b977d5decf97f6 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 10 Dec 2019 16:01:15 +0100
Subject: [PATCH 029/110] updating distilbert test

---
 transformers/tests/modeling_common_test.py    | 7 ++++++-
 transformers/tests/modeling_tf_common_test.py | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index 792f5cee3e..2f2baff436 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -121,7 +121,12 @@ class CommonTestCases:
                 model.to(torch_device)
                 model.eval()
                 first, second = model(**inputs_dict)[0], model(**inputs_dict)[0]
-                self.assertEqual(first.ne(second).sum().item(), 0)
+                out_1 = first.cpu().numpy()
+                out_2 = second.cpu().numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)
 
         def test_attention_outputs(self):
             config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
diff --git a/transformers/tests/modeling_tf_common_test.py b/transformers/tests/modeling_tf_common_test.py
index a0d63583fb..5a5873e81b 100644
--- a/transformers/tests/modeling_tf_common_test.py
+++ b/transformers/tests/modeling_tf_common_test.py
@@ -294,7 +294,12 @@ class TFCommonTestCases:
             for model_class in self.all_model_classes:
                 model = model_class(config)
                 first, second = model(inputs_dict, training=False)[0], model(inputs_dict, training=False)[0]
-                self.assertTrue(tf.math.equal(first, second).numpy().all())
+                out_1 = first.numpy()
+                out_2 = second.numpy()
+                out_1 = out_1[~np.isnan(out_1)]
+                out_2 = out_2[~np.isnan(out_2)]
+                max_diff = np.amax(np.abs(out_1 - out_2))
+                self.assertLessEqual(max_diff, 1e-5)
 
         def _get_embeds(self, wte, input_ids):
             # ^^ In our TF models, the input_embeddings can take slightly different forms,

From f2538c12741df74abbd2ff38f43019cfbb21093b Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 10 Dec 2019 16:33:11 +0100
Subject: [PATCH 030/110] all tests in torch no grad

---
 transformers/tests/modeling_common_test.py | 53 ++++++++++++++--------
 1 file changed, 35 insertions(+), 18 deletions(-)

diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index 2f2baff436..ed6f950e25 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -120,7 +120,9 @@ class CommonTestCases:
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                first, second = model(**inputs_dict)[0], model(**inputs_dict)[0]
+                with torch.no_grad():
+                    first = model(**inputs_dict)[0]
+                    second = model(**inputs_dict)[0]
                 out_1 = first.cpu().numpy()
                 out_2 = second.cpu().numpy()
                 out_1 = out_1[~np.isnan(out_1)]
@@ -142,7 +144,8 @@ class CommonTestCases:
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, False)
@@ -173,7 +176,8 @@ class CommonTestCases:
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 self.assertEqual(out_len + (2 if self.is_encoder_decoder else 1), len(outputs))
                 self.assertEqual(model.config.output_attentions, True)
                 self.assertEqual(model.config.output_hidden_states, True)
@@ -273,7 +277,8 @@ class CommonTestCases:
                 inputs = inputs_dict.copy()
                 inputs['head_mask'] = head_mask
 
-                outputs = model(**inputs)
+                with torch.no_grad():
+                    outputs = model(**inputs)
 
                 # Test that we can get a gradient back for importance score computation
                 output = sum(t.sum() for t in outputs[0])
@@ -320,7 +325,8 @@ class CommonTestCases:
                 heads_to_prune = {0: list(range(1, self.model_tester.num_attention_heads)),
                                 -1: [0]}
                 model.prune_heads(heads_to_prune)
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
 
                 attentions = outputs[-1]
 
@@ -356,7 +362,8 @@ class CommonTestCases:
                 model = model_class.from_pretrained(directory)
                 model.to(torch_device)
 
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
                 self.assertEqual(attentions[0].shape[-3], 1)
                 self.assertEqual(attentions[1].shape[-3], self.model_tester.num_attention_heads)
@@ -385,7 +392,8 @@ class CommonTestCases:
                 model.to(torch_device)
                 model.eval()
 
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
 
                 self.assertEqual(attentions[0].shape[-3], 1)
@@ -412,7 +420,8 @@ class CommonTestCases:
                 model.to(torch_device)
                 model.eval()
 
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
 
                 self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
@@ -429,7 +438,8 @@ class CommonTestCases:
                 model.to(torch_device)
                 shutil.rmtree(directory)
 
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
 
                 self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads - 1)
@@ -440,7 +450,8 @@ class CommonTestCases:
                 heads_to_prune = {0: [0], 2: [1, 2]}
                 model.prune_heads(heads_to_prune)
 
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 attentions = outputs[-1]
 
                 self.assertEqual(attentions[0].shape[-3], self.model_tester.num_attention_heads -1)
@@ -459,7 +470,8 @@ class CommonTestCases:
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
                 hidden_states = outputs[-1]
                 self.assertEqual(model.config.output_attentions, False)
                 self.assertEqual(model.config.output_hidden_states, True)
@@ -594,7 +606,8 @@ class CommonTestCases:
                     inputs_dict["encoder_inputs_embeds"] = wte(encoder_input_ids)
                     inputs_dict["decoder_inputs_embeds"] = wte(decoder_input_ids)
 
-                outputs = model(**inputs_dict)
+                with torch.no_grad():
+                    outputs = model(**inputs_dict)
 
     class GPTModelTester(CommonModelTester):
 
@@ -682,9 +695,10 @@ class CommonTestCases:
             model.to(torch_device)
             model.eval()
 
-            outputs = model(input_ids, position_ids, token_type_ids)
-            outputs = model(input_ids, position_ids)
-            outputs = model(input_ids)
+            with torch.no_grad():
+                outputs = model(input_ids, position_ids, token_type_ids)
+                outputs = model(input_ids, position_ids)
+                outputs = model(input_ids)
 
             hidden_state = outputs[0]
             self.parent.assertListEqual(
@@ -697,7 +711,8 @@ class CommonTestCases:
             model = self.lm_head_model_class(config)
             model.to(torch_device)
             model.eval()
-            outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
+            with torch.no_grad():
+                outputs = model(input_ids, position_ids, token_type_ids, lm_labels)
             loss, lm_logits = outputs[:2]
 
             total_voc = self.vocab_size
@@ -714,7 +729,8 @@ class CommonTestCases:
                 model = model_class(config)
                 model.to(torch_device)
                 model.eval()
-                outputs = model(input_ids)
+                with torch.no_grad():
+                    outputs = model(input_ids)
                 presents = outputs[-1]
                 self.parent.assertEqual(self.num_hidden_layers, len(presents))
                 self.parent.assertListEqual(
@@ -727,7 +743,8 @@ class CommonTestCases:
             model = self.double_head_model_class(config)
             model.to(torch_device)
             model.eval()
-            outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
+            with torch.no_grad():
+                outputs = model(input_ids, mc_token_ids, lm_labels=lm_labels, mc_labels=mc_labels,
                             token_type_ids=token_type_ids, position_ids=position_ids)
             lm_loss, mc_loss, lm_logits, mc_logits = outputs[:4]
             loss = [lm_loss, mc_loss]

From 67a8be8e90a7fbd5e0bceff9f29fb89ccabb61be Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 10 Dec 2019 17:50:32 +0100
Subject: [PATCH 031/110] fix backward in tests

---
 transformers/tests/modeling_common_test.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index ed6f950e25..cd4cf247a6 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -277,8 +277,7 @@ class CommonTestCases:
                 inputs = inputs_dict.copy()
                 inputs['head_mask'] = head_mask
 
-                with torch.no_grad():
-                    outputs = model(**inputs)
+                outputs = model(**inputs)
 
                 # Test that we can get a gradient back for importance score computation
                 output = sum(t.sum() for t in outputs[0])

From dc4e9e5cb36ae9bf5185b49b1cbc9106857abd54 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 10 Dec 2019 19:21:20 +0000
Subject: [PATCH 032/110] DataParallel for SQuAD + fix XLM

---
 examples/run_squad.py                      | 6 +++++-
 transformers/data/metrics/squad_metrics.py | 7 ++++++-
 transformers/tokenization_xlm.py           | 4 ++++
 3 files changed, 15 insertions(+), 2 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 2df29014ef..5e3f9663e2 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -299,10 +299,14 @@ def evaluate(args, model, tokenizer, prefix=""):
 
     # XLNet and XLM use a more complex post-processing procedure
     if args.model_type in ['xlnet', 'xlm']:
+
+        start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
+        end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
+
         predictions = compute_predictions_log_probs(examples, features, all_results, args.n_best_size,
                         args.max_answer_length, output_prediction_file,
                         output_nbest_file, output_null_log_odds_file,
-                        model.config.start_n_top, model.config.end_n_top,
+                        start_n_top, end_n_top,
                         args.version_2_with_negative, tokenizer, args.verbose_logging)
     else:
         predictions = compute_predictions_logits(examples, features, all_results, args.n_best_size,
diff --git a/transformers/data/metrics/squad_metrics.py b/transformers/data/metrics/squad_metrics.py
index 0755c0ab7a..7b03255f49 100644
--- a/transformers/data/metrics/squad_metrics.py
+++ b/transformers/data/metrics/squad_metrics.py
@@ -695,7 +695,12 @@ def compute_predictions_log_probs(
             tok_text = " ".join(tok_text.split())
             orig_text = " ".join(orig_tokens)
 
-            final_text = get_final_text(tok_text, orig_text, tokenizer.do_lower_case,
+            if hasattr(tokenizer, "do_lower_case"):
+                do_lower_case = tokenizer.do_lower_case
+            else:
+                do_lower_case = tokenizer.do_lowercase_and_remove_accent
+
+            final_text = get_final_text(tok_text, orig_text, do_lower_case,
                                         verbose_logging)
 
             if final_text in seen_predictions:
diff --git a/transformers/tokenization_xlm.py b/transformers/tokenization_xlm.py
index 6c9f8e5e5c..8def80bec4 100644
--- a/transformers/tokenization_xlm.py
+++ b/transformers/tokenization_xlm.py
@@ -549,6 +549,10 @@ class XLMTokenizer(PreTrainedTokenizer):
                                            additional_special_tokens=additional_special_tokens,
                                            **kwargs)
 
+
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens 
+
         # cache of sm.MosesPunctNormalizer instance
         self.cache_moses_punct_normalizer = dict()
         # cache of sm.MosesTokenizer instance

From 6a73382706ce3c6905023872f63a680f0eb419a4 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 10 Dec 2019 14:33:24 -0500
Subject: [PATCH 033/110] Complete warning + cleanup

---
 examples/run_squad.py              | 1 -
 transformers/tokenization_utils.py | 4 ++--
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 5e3f9663e2..79c8537a4b 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -299,7 +299,6 @@ def evaluate(args, model, tokenizer, prefix=""):
 
     # XLNet and XLM use a more complex post-processing procedure
     if args.model_type in ['xlnet', 'xlm']:
-
         start_n_top = model.config.start_n_top if hasattr(model, "config") else model.module.config.start_n_top
         end_n_top = model.config.end_n_top if hasattr(model, "config") else model.module.config.end_n_top
 
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index f4395cd82c..cb931b0eaf 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -226,7 +226,7 @@ class PreTrainedTokenizer(object):
 
         self.max_len = max_len if max_len is not None else int(1e12)
 
-        # Padding side is right by default and over-riden in subclsses. If specified in the kwargs, it is changed.
+        # Padding side is right by default and overridden in subclasses. If specified in the kwargs, it is changed.
         self.padding_side = kwargs.pop('padding_side', self.padding_side)
         
         # Added tokens
@@ -1003,7 +1003,7 @@ class PreTrainedTokenizer(object):
         )
 
         if pad_to_max_length and max_length is None and self.max_len > 10000:
-            logger.warning("Sequence can't be padded as the maximum  ")
+            logger.warning("Sequence can't be padded as no maximum length is specified and the model maximum length is too high.")
 
         if needs_to_be_padded:
             difference = (max_length if max_length is not None else self.max_len) - len(encoded_inputs["input_ids"])

From 58d75aa310e872723ba92ee1f0cb575ae9e2eaef Mon Sep 17 00:00:00 2001
From: Leo Dirac <deepembedding@gmail.com>
Date: Tue, 10 Dec 2019 11:36:56 -0800
Subject: [PATCH 034/110] Progress indicator improvements when downloading
 pre-trained models.

---
 transformers/file_utils.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformers/file_utils.py b/transformers/file_utils.py
index 24abd60781..68de4e6e2f 100644
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -21,7 +21,7 @@ import boto3
 from botocore.config import Config
 from botocore.exceptions import ClientError
 import requests
-from tqdm import tqdm
+from tqdm.auto import tqdm
 from contextlib import contextmanager
 
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
@@ -245,7 +245,7 @@ def http_get(url, temp_file, proxies=None, resume_size=0):
         return
     content_length = response.headers.get('Content-Length')
     total = resume_size + int(content_length) if content_length is not None else None
-    progress = tqdm(unit="B", total=total, initial=resume_size)
+    progress = tqdm(unit="B", unit_scale=True, total=total, initial=resume_size, desc="Downloading")
     for chunk in response.iter_content(chunk_size=1024):
         if chunk: # filter out keep-alive new chunks
             progress.update(len(chunk))

From fafd4c86ecb63bb90b095bbd23453553e33fe99d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 11 Dec 2019 13:47:27 +0100
Subject: [PATCH 035/110] fix TF 2.0 version of T5 - update conversion script

---
 .../convert_pytorch_checkpoint_to_tf2.py      | 11 ++---
 transformers/file_utils.py                    |  3 ++
 transformers/modeling_t5.py                   | 21 +++++++--
 transformers/modeling_tf_t5.py                | 43 ++++++++++++-------
 transformers/modeling_tf_utils.py             |  6 +--
 transformers/modeling_utils.py                | 12 +++++-
 6 files changed, 65 insertions(+), 31 deletions(-)

diff --git a/transformers/convert_pytorch_checkpoint_to_tf2.py b/transformers/convert_pytorch_checkpoint_to_tf2.py
index 76d75b43e4..4a9832f123 100644
--- a/transformers/convert_pytorch_checkpoint_to_tf2.py
+++ b/transformers/convert_pytorch_checkpoint_to_tf2.py
@@ -120,24 +120,21 @@ def convert_pt_checkpoint_to_tf(model_type, pytorch_checkpoint_path, config_file
     tf_model = load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path)
 
     if compare_with_pt_model:
-        inputs_list = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-        tf_inputs = tf_model.dummy_inputs
-        tfo = tf_model(tf_inputs, training=False)  # build the network
+        tfo = tf_model(tf_model.dummy_inputs, training=False)  # build the network
 
         state_dict = torch.load(pytorch_checkpoint_path, map_location='cpu')
         pt_model = pt_model_class.from_pretrained(pretrained_model_name_or_path=None,
                                                   config=config,
                                                   state_dict=state_dict)
 
-        pt_inputs = torch.tensor(inputs_list)
         with torch.no_grad():
-            pto = pt_model(pt_inputs)
+            pto = pt_model(**pt_model.dummy_inputs)
 
-        np_pt = pto[0].detach().numpy()
+        np_pt = pto[0].numpy()
         np_tf = tfo[0].numpy()
         diff = np.amax(np.abs(np_pt - np_tf))
         print("Max absolute difference between models outputs {}".format(diff))
-        assert diff <= 2e-2, "Error, model absolute difference is >2e-2"
+        assert diff <= 2e-2, "Error, model absolute difference is >2e-2: {}".format(diff)
 
     # Save pytorch-model
     print("Save TensorFlow model to {}".format(tf_dump_path))
diff --git a/transformers/file_utils.py b/transformers/file_utils.py
index 24abd60781..e36bbf4eeb 100644
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -73,6 +73,9 @@ TF2_WEIGHTS_NAME = 'tf_model.h5'
 TF_WEIGHTS_NAME = 'model.ckpt'
 CONFIG_NAME = "config.json"
 
+DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
+DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
+
 def is_torch_available():
     return _torch_available
 
diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index ffc4d8bb3f..149b977abc 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -32,7 +32,7 @@ from torch.nn import CrossEntropyLoss, MSELoss
 
 from .modeling_utils import PreTrainedModel
 from .configuration_t5 import T5Config
-from .file_utils import add_start_docstrings
+from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
 
 logger = logging.getLogger(__name__)
 
@@ -451,6 +451,15 @@ class T5PreTrainedModel(PreTrainedModel):
     load_tf_weights = load_tf_weights_in_t5
     base_model_prefix = "transformer"
 
+    @property
+    def dummy_inputs(self):
+        input_ids = torch.tensor(DUMMY_INPUTS)
+        input_mask = torch.tensor(DUMMY_MASK)
+        dummy_inputs = {'decoder_input_ids': input_ids,
+                        'encoder_input_ids': input_ids,
+                        'decoder_attention_mask': input_mask}
+        return dummy_inputs
+
     def _init_weights(self, module):
         """ Initialize the weights """
         factor = self.config.initializer_factor  # Used for testing weights initialization
@@ -534,9 +543,10 @@ class T5Stack(T5PreTrainedModel):
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
 
-        # T5 has a mask that can compare sequence ids, we simulate this here with this transposistion
+        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
         # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
-        extended_attention_mask = (extended_attention_mask == extended_attention_mask.transpose(-1, -2))
+        # extended_attention_mask = (extended_attention_mask == extended_attention_mask.transpose(-1, -2))
+
         extended_attention_mask = extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
         extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
 
@@ -548,6 +558,10 @@ class T5Stack(T5PreTrainedModel):
             if encoder_attention_mask.dim() == 2:
                 encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
 
+            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+            # encoder_extended_attention_mask = (encoder_extended_attention_mask == encoder_extended_attention_mask.transpose(-1, -2))
+
             encoder_extended_attention_mask = encoder_extended_attention_mask.to(dtype=next(self.parameters()).dtype)  # fp16 compatibility
             encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
         else:
@@ -590,6 +604,7 @@ class T5Stack(T5PreTrainedModel):
             hidden_states = layer_outputs[0]
             if i == 0:
                 # We share the position biases between the layers - the first layer store them
+                # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
                 position_bias = layer_outputs[2 if self.output_attentions else 1]
                 if self.is_decoder:
                     encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py
index 0b3b1116f2..fd25328ac6 100644
--- a/transformers/modeling_tf_t5.py
+++ b/transformers/modeling_tf_t5.py
@@ -26,7 +26,7 @@ import tensorflow as tf
 
 from .configuration_t5 import T5Config
 from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, shape_list
-from .file_utils import add_start_docstrings
+from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
 
 logger = logging.getLogger(__name__)
 
@@ -61,7 +61,7 @@ class TFT5LayerNorm(tf.keras.layers.Layer):
         super(TFT5LayerNorm, self).build(input_shape)
 
     def call(self, x):
-        variance = tf.math.reduce_min(tf.math.square(x), axis=-1, keepdims=True)
+        variance = tf.math.reduce_mean(tf.math.square(x), axis=-1, keepdims=True)
         x = x * tf.math.rsqrt(variance + self.variance_epsilon)
         return self.weight * x
 
@@ -231,19 +231,19 @@ class TFT5Attention(tf.keras.layers.Layer):
             cache[self.layer_id] = (k, v)
 
         # q = q / math.sqrt(dim_per_head)                                     # No scaling in T5
-        scores = tf.matmul(q, k, transpose_b=True)                            # (bs, n_heads, qlen, klen)
+        # scores = tf.matmul(q, k, transpose_b=True)                            # (bs, n_heads, qlen, klen)
+        scores = tf.einsum('bnqd,bnkd->bnqk', q, k)                        # (bs, n_heads, qlen, klen)
 
         if position_bias is None:
             if not self.has_relative_attention_bias:
                 raise ValueError("No position_bias provided and no weights to compute position_bias")
             position_bias = self.compute_bias(qlen, klen)
+            if mask is not None:
+                position_bias = position_bias + mask
+                # mask = (mask == 0).expand_as(scores)                              # (bs, n_heads, qlen, klen)
+                # scores.masked_fill_(mask, -float('inf'))                          # (bs, n_heads, qlen, klen)
+
         scores += position_bias
-
-        if mask is not None:
-            scores += mask
-            # mask = (mask == 0).expand_as(scores)                              # (bs, n_heads, qlen, klen)
-            # scores.masked_fill_(mask, -float('inf'))                          # (bs, n_heads, qlen, klen)
-
         weights = tf.nn.softmax(scores, axis=-1)                              # (bs, n_heads, qlen, klen)
         weights = self.dropout(weights, training=training)                    # (bs, n_heads, qlen, klen)
 
@@ -350,11 +350,11 @@ class TFT5Block(tf.keras.layers.Layer):
                                                     head_mask=head_mask,
                                                     training=training)
             hidden_states = cross_attention_outputs[0]
-            outputs = cross_attention_outputs[1:] + outputs
+            outputs = outputs + cross_attention_outputs[1:]
             hidden_states = self.layer[2](hidden_states, training=training)
 
         outputs = (hidden_states,) + outputs  # add attentions if we output them
-        return outputs
+        return outputs # hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
 
 
 ####################################################
@@ -418,7 +418,13 @@ class TFT5MainLayer(tf.keras.layers.Layer):
         # positions we want to attend and -10000.0 for masked positions.
         # Since we are adding it to the raw scores before the softmax, this is
         # effectively the same as removing these entirely.
-        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+
+        # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+        # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+        # extended_attention_mask = tf.math.equal(extended_attention_mask,
+        #                                         tf.transpose(extended_attention_mask, perm=(-1, -2)))
+
+        extended_attention_mask = (1.0 - extended_attention_mask) * -1e9
 
         if self.is_decoder:
             # If a 2D ou 3D attention mask is provided for the cross-attention
@@ -430,7 +436,12 @@ class TFT5MainLayer(tf.keras.layers.Layer):
             if num_dims_encoder_attention_mask == 2:
                 encoder_extended_attention_mask = encoder_attention_mask[:, None, None, :]
 
-            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -10000.0
+            # T5 has a mask that can compare sequence ids, we can simulate this here with this transposition
+            # Cf. https://github.com/tensorflow/mesh/blob/8d2465e9bc93129b913b5ccc6a59aa97abd96ec6/mesh_tensorflow/transformer/transformer_layers.py#L270
+            # encoder_extended_attention_mask = tf.math.equal(encoder_extended_attention_mask,
+            #                                         tf.transpose(encoder_extended_attention_mask, perm=(-1, -2)))
+
+            encoder_extended_attention_mask = (1.0 - encoder_extended_attention_mask) * -1e9
         else:
             encoder_extended_attention_mask = None
 
@@ -463,6 +474,8 @@ class TFT5MainLayer(tf.keras.layers.Layer):
                                          training=training)
             hidden_states = layer_outputs[0]
             if i == 0:
+                # We share the position biases between the layers - the first layer stores them
+                # layer_outputs = hidden-states, (self-attention weights), (self-attention position bias), (cross-attention weights), (cross-attention position bias)
                 position_bias = layer_outputs[2 if self.output_attentions else 1]
                 if self.is_decoder:
                     encoder_decoder_position_bias = layer_outputs[4 if self.output_attentions else 2]
@@ -502,8 +515,8 @@ class TFT5PreTrainedModel(TFPreTrainedModel):
 
     @property
     def dummy_inputs(self):
-        input_ids = tf.constant([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
-        input_mask = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        input_ids = tf.constant(DUMMY_INPUTS)
+        input_mask = tf.constant(DUMMY_MASK)
         dummy_inputs = {'decoder_input_ids': input_ids,
                         'encoder_input_ids': input_ids,
                         'decoder_attention_mask': input_mask}
diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py
index ed8fdb74c9..8d010e589e 100644
--- a/transformers/modeling_tf_utils.py
+++ b/transformers/modeling_tf_utils.py
@@ -24,13 +24,11 @@ import os
 import tensorflow as tf
 
 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME, DUMMY_INPUTS
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 
 logger = logging.getLogger(__name__)
 
-DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
-
 class TFPreTrainedModel(tf.keras.Model):
     r""" Base class for all TF models.
 
@@ -59,7 +57,7 @@ class TFPreTrainedModel(tf.keras.Model):
         Returns:
             tf.Tensor with dummy inputs
         """
-        return tf.constant(DUMMY_INPUTS)
+        return {'input_ids': tf.constant(DUMMY_INPUTS)}
 
     def __init__(self, config, *inputs, **kwargs):
         super(TFPreTrainedModel, self).__init__(*inputs, **kwargs)
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index aa0e0e6191..ae515d6870 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -31,11 +31,10 @@ from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F
 
 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME, DUMMY_INPUTS
 
 logger = logging.getLogger(__name__)
 
-
 try:
     from torch.nn import Identity
 except ImportError:
@@ -71,6 +70,15 @@ class PreTrainedModel(nn.Module):
     load_tf_weights = lambda model, config, path: None
     base_model_prefix = ""
 
+    @property
+    def dummy_inputs(self):
+        """ Dummy inputs to do a forward pass in the network.
+
+        Returns:
+            dict mapping 'input_ids' to a torch.Tensor of dummy inputs
+        """
+        return {'input_ids': torch.tensor(DUMMY_INPUTS)}
+
     def __init__(self, config, *inputs, **kwargs):
         super(PreTrainedModel, self).__init__()
         if not isinstance(config, PretrainedConfig):

From 4c12860f7ae61659aed2675498350a386fc4e122 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 11 Dec 2019 09:22:37 -0500
Subject: [PATCH 036/110] Remove misleading documentation

---
 transformers/tokenization_utils.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index cb931b0eaf..68a767fe82 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -628,7 +628,6 @@ class PreTrainedTokenizer(object):
             Take care of added tokens.
 
             text: The sequence to be encoded.
-            return_tokens_mapped_to_origin: (optional) Set to True to return the index of each token in the initial whitespace tokenization. (default False).
             **kwargs: passed to the child `self.tokenize()` method
         """
         def lowercase_text(t):

From 2e2f9fed554bb5f147ea3d9573004b447dd7c9e7 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 11 Dec 2019 11:11:56 -0500
Subject: [PATCH 037/110] rm duplicate imports

---
 transformers/modeling_auto.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py
index b63e43d73b..6ba1aab7a3 100644
--- a/transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -28,7 +28,6 @@ from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassifica
 from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
 from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
 from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
-from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
 from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
 
 from .modeling_utils import PreTrainedModel, SequenceSummary

From 29570db25ba9dd30e5ac9be68dbcad95434964ec Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Wed, 11 Dec 2019 17:19:18 +0100
Subject: [PATCH 038/110] allowing from_pretrained to load from url directly

---
 transformers/modeling_tf_utils.py | 4 +++-
 transformers/modeling_utils.py    | 7 +++++--
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py
index ed8fdb74c9..e7512b5bd6 100644
--- a/transformers/modeling_tf_utils.py
+++ b/transformers/modeling_tf_utils.py
@@ -259,8 +259,10 @@ class TFPreTrainedModel(tf.keras.Model):
                         pretrained_model_name_or_path))
             elif os.path.isfile(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
+            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
+                archive_file = pretrained_model_name_or_path + ".index"
             else:
-                raise EnvironmentError("Error file {} not found".format(pretrained_model_name_or_path))
+                archive_file = pretrained_model_name_or_path
 
             # redirect to the cache, if necessary
             try:
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index 3ac568771e..9e7ca8d689 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -365,9 +365,12 @@ class PreTrainedModel(nn.Module):
                         pretrained_model_name_or_path))
             elif os.path.isfile(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
-            else:
-                assert from_tf, "Error finding file {}, no file or TF 1.X checkpoint found".format(pretrained_model_name_or_path)
+            elif os.path.isfile(pretrained_model_name_or_path + ".index"):
+                assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
+                    pretrained_model_name_or_path + ".index")
                 archive_file = pretrained_model_name_or_path + ".index"
+            else:
+                archive_file = pretrained_model_name_or_path
 
             # redirect to the cache, if necessary
             try:

From 030faccb8d45be9bdd2b4b80ff26f36dc41f622a Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Wed, 11 Dec 2019 17:44:21 +0100
Subject: [PATCH 039/110] doc: fix pretrained models table

---
 docs/source/pretrained_models.rst | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index dd61f11769..2fe1f8a314 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -169,35 +169,35 @@ Here is the full list of the currently provided pretrained models together with
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | ALBERT            | ``albert-base-v1``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
 |                   |                                                            | | ALBERT base model                                                                                                                   |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-large-v1``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
 |                   |                                                            | | ALBERT large model                                                                                                                  |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xlarge-v1``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
 |                   |                                                            | | ALBERT xlarge model                                                                                                                 |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xxlarge-v1``                                      | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                           |
 |                   |                                                            | | ALBERT xxlarge model                                                                                                                |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-base-v2``                                         | | 12 repeating layers, 128 embedding, 768-hidden, 12-heads, 11M parameters                                                            |
 |                   |                                                            | | ALBERT base model with no dropout, additional training data and longer training                                                     |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-large-v2``                                        | | 24 repeating layers, 128 embedding, 1024-hidden, 16-heads, 17M parameters                                                           |
 |                   |                                                            | | ALBERT large model with no dropout, additional training data and longer training                                                    |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xlarge-v2``                                       | | 24 repeating layers, 128 embedding, 2048-hidden, 16-heads, 58M parameters                                                           |
 |                   |                                                            | | ALBERT xlarge model with no dropout, additional training data and longer training                                                   |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``albert-xxlarge-v2``                                      | | 12 repeating layer, 128 embedding, 4096-hidden, 64-heads, 223M parameters                                                           |
 |                   |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                  |
-|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                             |
+|                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 
 

From c999a3e5050f1dc93d814abf352f3bf0c06572e7 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 11 Dec 2019 12:29:58 -0500
Subject: [PATCH 040/110] Allow from_pretrained to take a remote identifier

---
 transformers/configuration_utils.py |  8 +++++---
 transformers/file_utils.py          | 20 ++++++++++++++++----
 transformers/modeling_utils.py      |  8 +++++---
 transformers/tokenization_utils.py  | 10 +++++-----
 4 files changed, 31 insertions(+), 15 deletions(-)

diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py
index 08cee75d81..8ae30f2a48 100644
--- a/transformers/configuration_utils.py
+++ b/transformers/configuration_utils.py
@@ -24,7 +24,7 @@ import logging
 import os
 from io import open
 
-from .file_utils import cached_path, CONFIG_NAME
+from .file_utils import CONFIG_NAME, cached_path, is_remote_url, hf_bucket_url
 
 logger = logging.getLogger(__name__)
 
@@ -131,8 +131,10 @@ class PretrainedConfig(object):
             config_file = cls.pretrained_config_archive_map[pretrained_model_name_or_path]
         elif os.path.isdir(pretrained_model_name_or_path):
             config_file = os.path.join(pretrained_model_name_or_path, CONFIG_NAME)
-        else:
+        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
             config_file = pretrained_model_name_or_path
+        else:
+            config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)
         # redirect to the cache, if necessary
         try:
             resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
@@ -187,7 +189,7 @@ class PretrainedConfig(object):
 
     @classmethod
     def from_json_file(cls, json_file):
-        """Constructs a `BertConfig` from a json file of parameters."""
+        """Constructs a `Config` from a json file of parameters."""
         with open(json_file, "r", encoding='utf-8') as reader:
             text = reader.read()
         return cls.from_dict(json.loads(text))
diff --git a/transformers/file_utils.py b/transformers/file_utils.py
index 68de4e6e2f..5fd5e2ee39 100644
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -73,6 +73,8 @@ TF2_WEIGHTS_NAME = 'tf_model.h5'
 TF_WEIGHTS_NAME = 'model.ckpt'
 CONFIG_NAME = "config.json"
 
+S3_BUCKET_PREFIX = "https://s3.amazonaws.com/models.huggingface.co/bert"
+
 def is_torch_available():
     return _torch_available
 
@@ -103,6 +105,18 @@ else:
             return fn
         return docstring_decorator
 
+
+def is_remote_url(url_or_filename):
+    parsed = urlparse(url_or_filename)
+    return parsed.scheme in ('http', 'https', 's3')
+
+def hf_bucket_url(identifier, postfix=None):
+    if postfix is None:
+        return "/".join((S3_BUCKET_PREFIX, identifier))
+    else:
+        return "/".join((S3_BUCKET_PREFIX, identifier, postfix))
+
+
 def url_to_filename(url, etag=None):
     """
     Convert `url` into a hashed filename in a repeatable way.
@@ -171,9 +185,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
     if sys.version_info[0] == 3 and isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
 
-    parsed = urlparse(url_or_filename)
-
-    if parsed.scheme in ('http', 'https', 's3'):
+    if is_remote_url(url_or_filename):
         # URL, so get it from the cache (downloading if necessary)
         return get_from_cache(url_or_filename, cache_dir=cache_dir,
             force_download=force_download, proxies=proxies,
@@ -181,7 +193,7 @@ def cached_path(url_or_filename, cache_dir=None, force_download=False, proxies=N
     elif os.path.exists(url_or_filename):
         # File, and it exists.
         return url_or_filename
-    elif parsed.scheme == '':
+    elif urlparse(url_or_filename).scheme == '':
         # File, but it doesn't exist.
         raise EnvironmentError("file {} not found".format(url_or_filename))
     else:
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index 9e7ca8d689..eac4252336 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -31,7 +31,8 @@ from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F
 
 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME,
+                         cached_path, hf_bucket_url, is_remote_url)
 
 logger = logging.getLogger(__name__)
 
@@ -363,14 +364,15 @@ class PreTrainedModel(nn.Module):
                     raise EnvironmentError("Error no file named {} found in directory {} or `from_tf` set to False".format(
                         [WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME + ".index"],
                         pretrained_model_name_or_path))
-            elif os.path.isfile(pretrained_model_name_or_path):
+            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
             elif os.path.isfile(pretrained_model_name_or_path + ".index"):
                 assert from_tf, "We found a TensorFlow checkpoint at {}, please set from_tf to True to load from this checkpoint".format(
                     pretrained_model_name_or_path + ".index")
                 archive_file = pretrained_model_name_or_path + ".index"
             else:
-                archive_file = pretrained_model_name_or_path
+                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
+                # todo do we want to support TF checkpoints here?
 
             # redirect to the cache, if necessary
             try:
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 68a767fe82..2b2cec0c15 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -25,7 +25,7 @@ import itertools
 import re
 from io import open
 
-from .file_utils import cached_path, is_tf_available, is_torch_available
+from .file_utils import cached_path, is_remote_url, hf_bucket_url, is_tf_available, is_torch_available
 
 if is_tf_available():
     import tensorflow as tf
@@ -327,12 +327,12 @@ class PreTrainedTokenizer(object):
                 if os.path.isdir(pretrained_model_name_or_path):
                     # If a directory is provided we look for the standard filenames
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
-                else:
+                elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                     # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
                     full_file_name = pretrained_model_name_or_path
-                if not os.path.exists(full_file_name):
-                    logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
-                    full_file_name = None
+                else:
+                    full_file_name = hf_bucket_url(pretrained_model_name_or_path, postfix=file_name)
+                
                 vocab_files[file_id] = full_file_name
 
             # Look for the additional tokens files

From 3d57c51111054adb01b2ea94bfd45237eb282431 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 11 Dec 2019 15:10:17 -0500
Subject: [PATCH 041/110] Fix encode plus

---
 transformers/tokenization_utils.py | 39 ++++++++++++++++++------------
 1 file changed, 23 insertions(+), 16 deletions(-)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 68a767fe82..eace409555 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -916,7 +916,7 @@ class PreTrainedTokenizer(object):
             return_tensors: (optional) can be set to 'tf' or 'pt' to return respectively TensorFlow tf.constant
                 or PyTorch torch.Tensor instead of a list of python integers.
             return_token_type_ids: (optional) Set to False to avoid returning token_type_ids (default True).
-            return_attention_mask: (optional) Set to False to avoir returning attention mask (default True)
+            return_attention_mask: (optional) Set to False to avoid returning attention mask (default True)
             return_overflowing_tokens: (optional) Set to True to return overflowing token information (default False).
             return_special_tokens_mask: (optional) Set to True to return special tokens mask information (default False).
 
@@ -961,24 +961,13 @@ class PreTrainedTokenizer(object):
         if add_special_tokens:
             sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
             token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
-            special_tokens_mask = self.get_special_tokens_mask(ids, pair_ids)
         else:
             sequence = ids + pair_ids if pair else ids
             token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
-            special_tokens_mask = [0] * (len(ids) + (len(pair_ids) if pair else 0))
+
         if return_special_tokens_mask:
             encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
 
-        # Prepare inputs as tensors if asked
-        if return_tensors == 'tf' and is_tf_available():
-            sequence = tf.constant([sequence])
-            token_type_ids = tf.constant([token_type_ids])
-        elif return_tensors == 'pt' and is_torch_available():
-            sequence = torch.tensor([sequence])
-            token_type_ids = torch.tensor([token_type_ids])
-        elif return_tensors is not None:
-            logger.warning("Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(return_tensors))
-
         encoded_inputs["input_ids"] = sequence
         if return_token_type_ids:
             encoded_inputs["token_type_ids"] = token_type_ids
@@ -1015,10 +1004,9 @@ class PreTrainedTokenizer(object):
                 if return_special_tokens_mask:
                     encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
                 encoded_inputs["input_ids"] = encoded_inputs["input_ids"] + [self.pad_token_id] * difference
-
             elif self.padding_side == 'left':
                 if return_attention_mask:
-                    encoded_inputs["attention_mask"] =  [0] * difference + [1] * len(encoded_inputs["input_ids"])
+                    encoded_inputs["attention_mask"] = [0] * difference + [1] * len(encoded_inputs["input_ids"])
                 if return_token_type_ids:
                     encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs["token_type_ids"]
                 if return_special_tokens_mask:
@@ -1030,7 +1018,26 @@ class PreTrainedTokenizer(object):
             
         elif return_attention_mask:
             encoded_inputs["attention_mask"] = [1] * len(encoded_inputs["input_ids"])
-            
+
+        # Prepare inputs as tensors if asked
+        if return_tensors == 'tf' and is_tf_available():
+            encoded_inputs["input_ids"] = tf.constant([encoded_inputs["input_ids"]])
+            encoded_inputs["token_type_ids"] = tf.constant([encoded_inputs["token_type_ids"]])
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = tf.constant([encoded_inputs["attention_mask"]])
+
+        elif return_tensors == 'pt' and is_torch_available():
+            encoded_inputs["input_ids"] = torch.tensor([encoded_inputs["input_ids"]])
+            encoded_inputs["token_type_ids"] = torch.tensor([encoded_inputs["token_type_ids"]])
+
+            if "attention_mask" in encoded_inputs:
+                encoded_inputs["attention_mask"] = torch.tensor([encoded_inputs["attention_mask"]])
+        elif return_tensors is not None:
+            logger.warning(
+                "Unable to convert output to tensors format {}, PyTorch or TensorFlow is not available.".format(
+                    return_tensors))
+
         return encoded_inputs
 
     def truncate_sequences(self, ids, pair_ids=None, num_tokens_to_remove=0, truncation_strategy='longest_first', stride=0):

From 31e5b5ff2276c61af7eebb4c353934f8f675d728 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 11 Dec 2019 15:22:02 -0500
Subject: [PATCH 042/110] Fix tests + first example of doc

---
 transformers/tokenization_utils.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 2b2cec0c15..63d2cc5cb4 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -255,6 +255,7 @@ class PreTrainedTokenizer(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
 
@@ -282,6 +283,9 @@ class PreTrainedTokenizer(object):
             # Download vocabulary from S3 and cache.
             tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
+            # Download vocabulary from S3 (user-uploaded) and cache.
+            tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+
             # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
             tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')
 
@@ -327,6 +331,9 @@ class PreTrainedTokenizer(object):
                 if os.path.isdir(pretrained_model_name_or_path):
                     # If a directory is provided we look for the standard filenames
                     full_file_name = os.path.join(pretrained_model_name_or_path, file_name)
+                    if not os.path.exists(full_file_name):
+                        logger.info("Didn't find file {}. We won't load it.".format(full_file_name))
+                        full_file_name = None
                 elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                     # If a path to a file is provided we use it (will only work for non-BPE tokenizer using a single vocabulary file)
                     full_file_name = pretrained_model_name_or_path

From 18e1f751f1d996c4fe01559ade1cd013186b81e4 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 11 Dec 2019 17:07:46 -0500
Subject: [PATCH 043/110] TF support

---
 transformers/modeling_tf_utils.py | 9 ++++++---
 transformers/modeling_utils.py    | 3 ++-
 2 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py
index e7512b5bd6..4a6d18f447 100644
--- a/transformers/modeling_tf_utils.py
+++ b/transformers/modeling_tf_utils.py
@@ -24,7 +24,8 @@ import os
 import tensorflow as tf
 
 from .configuration_utils import PretrainedConfig
-from .file_utils import cached_path, WEIGHTS_NAME, TF_WEIGHTS_NAME, TF2_WEIGHTS_NAME
+from .file_utils import (TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, WEIGHTS_NAME,
+                         cached_path, hf_bucket_url, is_remote_url)
 from .modeling_tf_pytorch_utils import load_pytorch_checkpoint_in_tf2_model
 
 logger = logging.getLogger(__name__)
@@ -257,12 +258,14 @@ class TFPreTrainedModel(tf.keras.Model):
                     raise EnvironmentError("Error no file named {} found in directory {} or `from_pt` set to False".format(
                         [WEIGHTS_NAME, TF2_WEIGHTS_NAME],
                         pretrained_model_name_or_path))
-            elif os.path.isfile(pretrained_model_name_or_path):
+            elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
                 archive_file = pretrained_model_name_or_path
             elif os.path.isfile(pretrained_model_name_or_path + ".index"):
                 archive_file = pretrained_model_name_or_path + ".index"
             else:
-                archive_file = pretrained_model_name_or_path
+                archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=TF2_WEIGHTS_NAME)
+                if from_pt:
+                    raise EnvironmentError("Loading a TF model from a PyTorch checkpoint is not supported when using a model identifier name.")
 
             # redirect to the cache, if necessary
             try:
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index eac4252336..37088f8e67 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -372,7 +372,8 @@ class PreTrainedModel(nn.Module):
                 archive_file = pretrained_model_name_or_path + ".index"
             else:
                 archive_file = hf_bucket_url(pretrained_model_name_or_path, postfix=WEIGHTS_NAME)
-                # todo do we want to support TF checkpoints here?
+                if from_tf:
+                    raise EnvironmentError("Loading a PyTorch model from a TF checkpoint is not supported when using a model identifier name.")
 
             # redirect to the cache, if necessary
             try:

From 4f15e5a267201f86bdd9628cf58592d0e1cc86eb Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 11 Dec 2019 17:41:51 -0500
Subject: [PATCH 044/110] Add tests.

Maybe not the best possible place for the tests; let me know.
---
 transformers/tests/modeling_auto_test.py     | 7 ++++++-
 transformers/tests/modeling_tf_auto_test.py  | 7 ++++++-
 transformers/tests/tokenization_auto_test.py | 7 ++++++-
 transformers/tests/utils.py                  | 3 +++
 4 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/transformers/tests/modeling_auto_test.py b/transformers/tests/modeling_auto_test.py
index 9b7d920bc8..871a262fe8 100644
--- a/transformers/tests/modeling_auto_test.py
+++ b/transformers/tests/modeling_auto_test.py
@@ -22,7 +22,7 @@ import logging
 
 from transformers import is_torch_available
 
-from .utils import require_torch, slow
+from .utils import require_torch, slow, SMALL_MODEL_IDENTIFIER
 
 if is_torch_available():
     from transformers import (AutoConfig, BertConfig,
@@ -92,6 +92,11 @@ class AutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, BertForQuestionAnswering)
 
+    def test_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        model = AutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(model, BertForMaskedLM)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/modeling_tf_auto_test.py b/transformers/tests/modeling_tf_auto_test.py
index 7ea48015d9..7ab6eaa3d6 100644
--- a/transformers/tests/modeling_tf_auto_test.py
+++ b/transformers/tests/modeling_tf_auto_test.py
@@ -22,7 +22,7 @@ import logging
 
 from transformers import is_tf_available
 
-from .utils import require_tf, slow
+from .utils import require_tf, slow, SMALL_MODEL_IDENTIFIER
 
 if is_tf_available():
     from transformers import (AutoConfig, BertConfig,
@@ -93,6 +93,11 @@ class TFAutoModelTest(unittest.TestCase):
             self.assertIsNotNone(model)
             self.assertIsInstance(model, TFBertForQuestionAnswering)
 
+    def test_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER, force_download=True)
+        self.assertIsInstance(model, TFBertForMaskedLM)
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/tokenization_auto_test.py b/transformers/tests/tokenization_auto_test.py
index 18346d2768..0a894cac04 100644
--- a/transformers/tests/tokenization_auto_test.py
+++ b/transformers/tests/tokenization_auto_test.py
@@ -23,7 +23,7 @@ import logging
 from transformers import AutoTokenizer, BertTokenizer, AutoTokenizer, GPT2Tokenizer
 from transformers import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
 
-from .utils import slow
+from .utils import slow, SMALL_MODEL_IDENTIFIER
 
 
 class AutoTokenizerTest(unittest.TestCase):
@@ -42,6 +42,11 @@ class AutoTokenizerTest(unittest.TestCase):
             self.assertIsInstance(tokenizer, GPT2Tokenizer)
             self.assertGreater(len(tokenizer), 0)
 
+    def test_tokenizer_from_pretrained_identifier(self):
+        logging.basicConfig(level=logging.INFO)
+        tokenizer = AutoTokenizer.from_pretrained(SMALL_MODEL_IDENTIFIER)
+        self.assertIsInstance(tokenizer, BertTokenizer)
+        self.assertEqual(len(tokenizer), 12)
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py
index 7a51ab612b..3aff1daf83 100644
--- a/transformers/tests/utils.py
+++ b/transformers/tests/utils.py
@@ -6,6 +6,9 @@ from distutils.util import strtobool
 from transformers.file_utils import _tf_available, _torch_available
 
 
+SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
+
+
 try:
     run_slow = os.environ["RUN_SLOW"]
 except KeyError:

From c03c0dfd230a5174c536a58d6ba5e590ed1afcc4 Mon Sep 17 00:00:00 2001
From: Masatoshi Suzuki <sappukei.suzuki@gmail.com>
Date: Fri, 15 Nov 2019 17:24:56 +0900
Subject: [PATCH 045/110] Add support for Japanese BERT models by cl-tohoku

---
 docs/source/pretrained_models.rst          |  18 ++
 transformers/__init__.py                   |   1 +
 transformers/configuration_bert.py         |   4 +
 transformers/modeling_bert.py              |   8 +-
 transformers/modeling_tf_bert.py           |  16 +-
 transformers/tokenization_auto.py          |   3 +
 transformers/tokenization_bert_japanese.py | 247 +++++++++++++++++++++
 7 files changed, 289 insertions(+), 8 deletions(-)
 create mode 100644 transformers/tokenization_bert_japanese.py

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 2fe1f8a314..d3498e057d 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -61,6 +61,24 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``bert-base-german-dbmdz-uncased``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on uncased German text by DBMDZ                                                                                             |
 |                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese``                                     | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                 |
+|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                            |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese-whole-word-masking``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                        |
+|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                            |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese-char``                                | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                          |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-japanese-char-whole-word-masking``             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                 |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | OpenAI GPT English model                                                                                                            |
diff --git a/transformers/__init__.py b/transformers/__init__.py
index f9a28add5f..5d7b0b772c 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -37,6 +37,7 @@ if is_sklearn_available():
 from .tokenization_utils import (PreTrainedTokenizer)
 from .tokenization_auto import AutoTokenizer
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
+from .tokenization_bert_japanese import BertJapaneseTokenizer, MecabTokenizer, CharacterTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py
index d63be963eb..16f1f60404 100644
--- a/transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -42,6 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
+    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-config.json",
+    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-config.json",
+    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-config.json",
+    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-config.json"
 }
 
 
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index d84b0a1a7c..e2e115a015 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -48,6 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
+    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-pytorch_model.bin",
+    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-pytorch_model.bin",
+    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-pytorch_model.bin",
+    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
 }
 
 
@@ -1233,9 +1237,9 @@ class BertForQuestionAnswering(BertPreTrainedModel):
         question, text = "Who was Jim Henson?", "Jim Henson was a nice puppet"
         input_text = "[CLS] " + question + " [SEP] " + text + " [SEP]"
         input_ids = tokenizer.encode(input_text)
-        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))] 
+        token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
         start_scores, end_scores = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
-        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)  
+        all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
         print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))
         # a nice puppet
 
diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py
index 5aa7bb3da2..27dd311a4d 100644
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -48,6 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
+    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-tf_model.h5",
+    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-tf_model.h5",
+    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-tf_model.h5",
+    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-tf_model.h5"
 }
 
 
@@ -129,7 +133,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
                 linear tensor, float32 with shape [batch_size, length, vocab_size].
         Raises:
             ValueError: if mode is not valid.
-        
+
         Shared weights logic adapted from
             https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
         """
@@ -148,7 +152,7 @@ class TFBertEmbeddings(tf.keras.layers.Layer):
             input_shape = shape_list(input_ids)
         else:
             input_shape = shape_list(inputs_embeds)[:-1]
-        
+
         seq_length = input_shape[1]
         if position_ids is None:
             position_ids = tf.range(seq_length, dtype=tf.int32)[tf.newaxis, :]
@@ -246,7 +250,7 @@ class TFBertSelfAttention(tf.keras.layers.Layer):
         context_layer = tf.matmul(attention_probs, value_layer)
 
         context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
-        context_layer = tf.reshape(context_layer, 
+        context_layer = tf.reshape(context_layer,
                                   (batch_size, -1, self.all_head_size))  # (batch_size, seq_len_q, all_head_size)
 
         outputs = (context_layer, attention_probs) if self.output_attentions else (context_layer,)
@@ -591,7 +595,7 @@ BERT_START_DOCSTRING = r"""    The BERT model was proposed in
             `model({'input_ids': input_ids, 'token_type_ids': token_type_ids})`
 
     Parameters:
-        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+        config (:class:`~transformers.BertConfig`): Model configuration class with all the parameters of the model.
             Initializing with a config file does not load the weights associated with the model, only the configuration.
             Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
@@ -605,13 +609,13 @@ BERT_INPUTS_DOCSTRING = r"""
             (a) For sequence pairs:
 
                 ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP]``
-                
+
                 ``token_type_ids:   0   0  0    0    0     0       0   0   1  1  1  1   1   1``
 
             (b) For single sequences:
 
                 ``tokens:         [CLS] the dog is hairy . [SEP]``
-                
+
                 ``token_type_ids:   0   0   0   0  0     0   0``
 
             Bert is a model with absolute position embeddings so it's usually advised to pad the inputs on
diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py
index b7c5046961..d63b7e783d 100644
--- a/transformers/tokenization_auto.py
+++ b/transformers/tokenization_auto.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import logging
 
 from .tokenization_bert import BertTokenizer
+from .tokenization_bert_japanese import BertJapaneseTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_ctrl import CTRLTokenizer
@@ -118,6 +119,8 @@ class AutoTokenizer(object):
             return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'bert-japanese' in pretrained_model_name_or_path:
+            return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
             return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'openai-gpt' in pretrained_model_name_or_path:
diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py
new file mode 100644
index 0000000000..8554a1c880
--- /dev/null
+++ b/transformers/tokenization_bert_japanese.py
@@ -0,0 +1,247 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes."""
+
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import collections
+import logging
+import os
+import unicodedata
+from io import open
+
+from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer, load_vocab
+from .tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-vocab.txt",
+        'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-vocab.txt",
+        'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-vocab.txt",
+        'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-vocab.txt"
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'bert-base-japanese': 512,
+    'bert-base-japanese-whole-word-masking': 512,
+    'bert-base-japanese-char': 512,
+    'bert-base-japanese-char-whole-word-masking': 512
+}
+
+PRETRAINED_INIT_CONFIGURATION = {
+    'bert-base-japanese': {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'wordpiece'
+    },
+    'bert-base-japanese-whole-word-masking':{
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'wordpiece'
+    },
+    'bert-base-japanese-char': {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'character'
+    },
+    'bert-base-japanese-char-whole-word-masking': {
+        'do_lower_case': False,
+        'word_tokenizer_type': 'mecab',
+        'subword_tokenizer_type': 'character'
+    }
+}
+
+
+class BertJapaneseTokenizer(BertTokenizer):
+    """BERT tokenizer for Japanese text"""
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, do_lower_case=False,
+                 do_word_tokenize=True, do_subword_tokenize=True,
+                 word_tokenizer_type='basic', subword_tokenizer_type='wordpiece',
+                 never_split=None, unk_token='[UNK]', sep_token='[SEP]',
+                 pad_token='[PAD]', cls_token='[CLS]', mask_token='[MASK]', **kwargs):
+        """Constructs a BertJapaneseTokenizer.
+
+        Args:
+            **vocab_file**: Path to a one-wordpiece-per-line vocabulary file.
+            **do_lower_case**: (`optional`) boolean (default False)
+                Whether to lower case the input.
+                Only has an effect when do_word_tokenize=True.
+            **do_word_tokenize**: (`optional`) boolean (default True)
+                Whether to do word tokenization.
+            **do_subword_tokenize**: (`optional`) boolean (default True)
+                Whether to do subword tokenization.
+            **word_tokenizer_type**: (`optional`) string (default "basic")
+                Type of word tokenizer.
+            **subword_tokenizer_type**: (`optional`) string (default "wordpiece")
+                Type of subword tokenizer.
+        """
+        super(BertTokenizer, self).__init__(unk_token=unk_token, sep_token=sep_token,
+                                            pad_token=pad_token, cls_token=cls_token,
+                                            mask_token=mask_token, **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 3  # take into account special tokens
+
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                "Can't find a vocabulary file at path '{}'. To load the vocabulary from a Google pretrained "
+                "model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`".format(vocab_file))
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict(
+            [(ids, tok) for tok, ids in self.vocab.items()])
+
+        self.do_word_tokenize = do_word_tokenize
+        if do_word_tokenize:
+            if word_tokenizer_type == 'basic':
+                self.word_tokenizer = BasicTokenizer(do_lower_case=do_lower_case,
+                                                     never_split=never_split,
+                                                     tokenize_chinese_chars=False)
+            elif word_tokenizer_type == 'mecab':
+                self.word_tokenizer = MecabTokenizer(do_lower_case=do_lower_case,
+                                                     never_split=never_split)
+            else:
+                raise ValueError(
+                    "Invalid word_tokenizer_type '{}' is specified.".format(word_tokenizer_type))
+
+        self.do_subword_tokenize = do_subword_tokenize
+        if do_subword_tokenize:
+            if subword_tokenizer_type == 'wordpiece':
+                self.subword_tokenizer = WordpieceTokenizer(vocab=self.vocab,
+                                                            unk_token=self.unk_token)
+            elif subword_tokenizer_type == 'character':
+                self.subword_tokenizer = CharacterTokenizer(vocab=self.vocab,
+                                                            unk_token=self.unk_token)
+            else:
+                raise ValueError(
+                    "Invalid subword_tokenizer_type '{}' is specified.".format(subword_tokenizer_type))
+
+
+    def _tokenize(self, text):
+        if self.do_word_tokenize:
+            tokens = self.word_tokenizer.tokenize(text,
+                                                  never_split=self.all_special_tokens)
+        else:
+            tokens = [text]
+
+        if self.do_subword_tokenize:
+            split_tokens = [sub_token for token in tokens
+                            for sub_token in self.subword_tokenizer.tokenize(token)]
+        else:
+            split_tokens = tokens
+
+        return split_tokens
+
+
+class MecabTokenizer(object):
+    """Runs basic tokenization with MeCab morphological parser."""
+
+    def __init__(self, do_lower_case=False, never_split=None, normalize_text=True):
+        """Constructs a MecabTokenizer.
+
+        Args:
+            **do_lower_case**: (`optional`) boolean (default False)
+                Whether to lower case the input.
+            **never_split**: (`optional`) list of str
+                Kept for backward compatibility purposes.
+                Now implemented directly at the base class level (see :func:`PreTrainedTokenizer.tokenize`)
+                List of tokens not to split.
+            **normalize_text**: (`optional`) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+        """
+        self.do_lower_case = do_lower_case
+        self.never_split = never_split if never_split is not None else []
+        self.normalize_text = normalize_text
+
+        import MeCab
+        self.mecab = MeCab.Tagger()
+
+    def tokenize(self, text, never_split=None, **kwargs):
+        """Tokenizes a piece of text."""
+        if self.normalize_text:
+            text = unicodedata.normalize('NFKC', text)
+
+        never_split = self.never_split + (never_split if never_split is not None else [])
+        tokens = []
+
+        cursor = 0
+        for line in self.mecab.parse(text).split('\n'):
+            if line == 'EOS':
+                break
+
+            token, _ = line.split('\t')
+            token_start = text.index(token, cursor)
+            token_end = token_start + len(token)
+            if self.do_lower_case and token not in never_split:
+                token = token.lower()
+
+            tokens.append(token)
+            cursor = token_end
+
+        return tokens
+
+
+class CharacterTokenizer(object):
+    """Runs Character tokenization."""
+
+    def __init__(self, vocab, unk_token, normalize_text=True):
+        """Constructs a CharacterTokenizer.
+
+        Args:
+            **vocab**:
+                Vocabulary object.
+            **unk_token**: str
+                A special symbol for out-of-vocabulary tokens.
+            **normalize_text**: (`optional`) boolean (default True)
+                Whether to apply unicode normalization to text before tokenization.
+        """
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.normalize_text = normalize_text
+
+    def tokenize(self, text):
+        """Tokenizes a piece of text into characters.
+
+        For example:
+            input = "apple"
+            output = ["a", "p", "p", "l", "e"]
+        Args:
+            text: A single token or whitespace separated tokens.
+                This should have already been passed through `BasicTokenizer`.
+        Returns:
+            A list of characters.
+        """
+        if self.normalize_text:
+            text = unicodedata.normalize('NFKC', text)
+
+        output_tokens = []
+        for i, char in enumerate(text):
+            if char not in self.vocab:
+                output_tokens.append(self.unk_token)
+                continue
+
+            output_tokens.append(char)
+
+        return output_tokens

From 57b5cb3eaa850a212235fccbd4e5d002aede72b6 Mon Sep 17 00:00:00 2001
From: Masatoshi Suzuki <sappukei.suzuki@gmail.com>
Date: Wed, 20 Nov 2019 09:02:10 +0900
Subject: [PATCH 046/110] Fix loading BertJapaneseTokenizer

---
 transformers/tokenization_auto.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py
index d63b7e783d..f36a584521 100644
--- a/transformers/tokenization_auto.py
+++ b/transformers/tokenization_auto.py
@@ -73,6 +73,7 @@ class AutoTokenizer(object):
             - contains `albert`: AlbertTokenizer (ALBERT model)
             - contains `camembert`: CamembertTokenizer (CamemBERT model)
             - contains `roberta`: RobertaTokenizer (RoBERTa model)
+            - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
             - contains `bert`: BertTokenizer (Bert model)
             - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
             - contains `gpt2`: GPT2Tokenizer (OpenAI GPT-2 model)
@@ -119,7 +120,7 @@ class AutoTokenizer(object):
             return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'bert-japanese' in pretrained_model_name_or_path:
+        elif 'bert-base-japanese' in pretrained_model_name_or_path:
             return BertJapaneseTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
             return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)

From a09da4eeb0397dd66d61182177dd3b753d70e62a Mon Sep 17 00:00:00 2001
From: Masatoshi Suzuki <sappukei.suzuki@gmail.com>
Date: Fri, 29 Nov 2019 19:24:43 +0900
Subject: [PATCH 047/110] Add a test for Japanese BERT tokenizers

---
 .../tests/tokenization_bert_japanese_test.py  | 192 ++++++++++++++++++
 1 file changed, 192 insertions(+)
 create mode 100644 transformers/tests/tokenization_bert_japanese_test.py

diff --git a/transformers/tests/tokenization_bert_japanese_test.py b/transformers/tests/tokenization_bert_japanese_test.py
new file mode 100644
index 0000000000..6f66b96411
--- /dev/null
+++ b/transformers/tests/tokenization_bert_japanese_test.py
@@ -0,0 +1,192 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import unittest
+import pytest
+from io import open
+
+from transformers.tokenization_bert import WordpieceTokenizer
+from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
+                                                     MecabTokenizer, CharacterTokenizer,
+                                                     VOCAB_FILES_NAMES)
+
+from .tokenization_tests_commons import CommonTestCases
+
+
+class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
+
+    tokenizer_class = BertJapaneseTokenizer
+
+    def setUp(self):
+        super(BertJapaneseTokenizationTest, self).setUp()
+
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは",
+            u"世界", u"##世界", u"、", u"##、", u"。", u"##。"]
+
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+
+    def get_tokenizer(self, **kwargs):
+        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname, **kwargs)
+
+    def get_input_output_texts(self):
+        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
+        output_text = u"こんにちは 、 世界 。 こんばんは 、 世界 。"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = self.tokenizer_class(self.vocab_file)
+
+        tokens = tokenizer.tokenize(u"こんにちは、世界。\nこんばんは、世界。")
+        self.assertListEqual(tokens,
+                             [u"こんにちは", u"、", u"世界", u"。",
+                              u"こん", u"##ばんは", u"、", u"世界", "。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
+                             [3, 12, 10, 14, 4, 9, 12, 10, 14])
+
+    def test_mecab_tokenizer(self):
+        tokenizer = MecabTokenizer()
+
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+                               [u"アップルストア", u"で", u"iPhone", u"8", u"が",
+                                u"発売", u"さ", u"れ", u"た", u"。"])
+
+    def test_mecab_tokenizer_lower(self):
+        tokenizer = MecabTokenizer(do_lower_case=True)
+
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+                               [u"アップルストア", u"で", u"iphone", u"8", u"が",
+                                u"発売", u"さ", u"れ", u"た", u"。"])
+
+    def test_mecab_tokenizer_no_normalize(self):
+        tokenizer = MecabTokenizer(normalize_text=False)
+
+        self.assertListEqual(
+            tokenizer.tokenize(u" \tｱｯﾌﾟﾙストアでiPhone８ が  \n 発売された　。  "),
+                               [u"ｱｯﾌﾟﾙストア", u"で", u"iPhone", u"８", u"が",
+                                u"発売", u"さ", u"れ", u"た", u"　", u"。"])
+
+    def test_wordpiece_tokenizer(self):
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こんにちは", u"こん", u"にちは", u"ばんは", u"##こん", u"##にちは", u"##ばんは"]
+
+        vocab = {}
+        for (i, token) in enumerate(vocab_tokens):
+            vocab[token] = i
+        tokenizer = WordpieceTokenizer(vocab=vocab, unk_token=u"[UNK]")
+
+        self.assertListEqual(tokenizer.tokenize(u""), [])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
+                             [u"こんにちは"])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんばんは"),
+                             [u"こん", u"##ばんは"])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
+                             [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])
+
+    @pytest.mark.slow
+    def test_sequence_builders(self):
+        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")
+
+        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
+        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)
+
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+
+        # 2 is for "[CLS]", 3 is for "[SEP]"
+        assert encoded_sentence == [2] + text + [3]
+        assert encoded_pair == [2] + text + [3] + text_2 + [3]
+
+
+class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTester):
+
+    tokenizer_class = BertJapaneseTokenizer
+
+    def setUp(self):
+        super(BertJapaneseCharacterTokenizationTest, self).setUp()
+
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"]
+
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])
+        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
+            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))
+
+    def get_tokenizer(self, **kwargs):
+        return BertJapaneseTokenizer.from_pretrained(self.tmpdirname,
+                                                     subword_tokenizer_type="character",
+                                                     **kwargs)
+
+    def get_input_output_texts(self):
+        input_text = u"こんにちは、世界。 \nこんばんは、世界。"
+        output_text = u"こ ん に ち は 、 世 界 。 こ ん ば ん は 、 世 界 。"
+        return input_text, output_text
+
+    def test_full_tokenizer(self):
+        tokenizer = self.tokenizer_class(self.vocab_file,
+                                         subword_tokenizer_type="character")
+
+        tokens = tokenizer.tokenize(u"こんにちは、世界。 \nこんばんは、世界。")
+        self.assertListEqual(tokens,
+            [u"こ", u"ん", u"に", u"ち", u"は", u"、", u"世", u"界", u"。",
+             u"こ", u"ん", u"ば", u"ん", u"は", u"、", u"世", u"界", u"。"])
+        self.assertListEqual(tokenizer.convert_tokens_to_ids(tokens),
+                             [3, 4, 5, 6, 7, 11, 9, 10, 12,
+                              3, 4, 8, 4, 7, 11, 9, 10, 12])
+
+    def test_character_tokenizer(self):
+        """Check CharacterTokenizer on empty, in-vocab and out-of-vocab characters."""
+        vocab_tokens = [u"[UNK]", u"[CLS]", u"[SEP]",
+            u"こ", u"ん", u"に", u"ち", u"は", u"ば", u"世", u"界", u"、", u"。"]
+
+        # Map every token to its position in vocab_tokens.
+        vocab = {token: i for (i, token) in enumerate(vocab_tokens)}
+        tokenizer = CharacterTokenizer(vocab=vocab, unk_token=u"[UNK]")
+
+        self.assertListEqual(tokenizer.tokenize(u""), [])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんにちは"),
+                             [u"こ", u"ん", u"に", u"ち", u"は"])
+
+        self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
+                             [u"こ", u"ん", u"に", u"ち", u"[UNK]"])
+
+    @pytest.mark.slow
+    def test_sequence_builders(self):
+        tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")
+
+        text = tokenizer.encode(u"ありがとう。", add_special_tokens=False)
+        text_2 = tokenizer.encode(u"どういたしまして。", add_special_tokens=False)
+
+        encoded_sentence = tokenizer.build_inputs_with_special_tokens(text)
+        encoded_pair = tokenizer.build_inputs_with_special_tokens(text, text_2)
+
+        # 2 is for "[CLS]", 3 is for "[SEP]"
+        assert encoded_sentence == [2] + text + [3]
+        assert encoded_pair == [2] + text + [3] + text_2 + [3]
+
+
+
+if __name__ == '__main__':
+    unittest.main()

From 6a43dc9d7d592362d144209097e1d93876f8e88a Mon Sep 17 00:00:00 2001
From: Masatoshi Suzuki <sappukei.suzuki@gmail.com>
Date: Thu, 5 Dec 2019 11:19:02 +0900
Subject: [PATCH 048/110] Support Python 2

---
 transformers/tokenization_bert_japanese.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py
index 8554a1c880..1ce0e1d1cb 100644
--- a/transformers/tokenization_bert_japanese.py
+++ b/transformers/tokenization_bert_japanese.py
@@ -19,6 +19,7 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 import collections
 import logging
 import os
+import six
 import unicodedata
 from io import open
 
@@ -186,8 +187,13 @@ class MecabTokenizer(object):
         never_split = self.never_split + (never_split if never_split is not None else [])
         tokens = []
 
+        if six.PY2:
+            mecab_output = self.mecab.parse(text.encode('utf-8')).decode('utf-8')
+        else:
+            mecab_output = self.mecab.parse(text)
+
         cursor = 0
-        for line in self.mecab.parse(text).split('\n'):
+        for line in mecab_output.split('\n'):
             if line == 'EOS':
                 break
 

From 597ba7feb384316081c96955196fcb7abb2edf06 Mon Sep 17 00:00:00 2001
From: Masatoshi Suzuki <sappukei.suzuki@gmail.com>
Date: Thu, 5 Dec 2019 11:30:40 +0900
Subject: [PATCH 049/110] Support testing Japanese BERT tokenizers

---
 .circleci/config.yml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 01e6d82b33..97f5f25606 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -13,6 +13,8 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
+            - run: sudo pip install mecab-python3
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py3_torch:
@@ -27,6 +29,8 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
+            - run: sudo pip install mecab-python3
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: python -m pytest -sv ./examples/
             - run: codecov
@@ -42,6 +46,8 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
+            - run: sudo pip install mecab-python3
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py2_torch:
@@ -55,6 +61,8 @@ jobs:
             - run: sudo pip install torch
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
+            - run: sudo pip install mecab-python
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py2_tf:
@@ -68,6 +76,8 @@ jobs:
             - run: sudo pip install tensorflow
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
+            - run: sudo pip install mecab-python
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     deploy_doc:

From d2100428d3652cefbffcf0bd00f0881090d26333 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 21:43:49 +0000
Subject: [PATCH 050/110] Update to new test infra and only run conditionally

---
 .circleci/config.yml                          | 20 ++++-----
 .../tests/tokenization_bert_japanese_test.py  |  9 ++--
 transformers/tests/utils.py                   | 42 +++++++++++++------
 3 files changed, 44 insertions(+), 27 deletions(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 97f5f25606..7ca5f8121c 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -13,8 +13,6 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
-            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
-            - run: sudo pip install mecab-python3
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py3_torch:
@@ -29,8 +27,6 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
-            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
-            - run: sudo pip install mecab-python3
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: python -m pytest -sv ./examples/
             - run: codecov
@@ -46,8 +42,6 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
             - run: sudo pip install tensorboardX scikit-learn
-            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
-            - run: sudo pip install mecab-python3
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py2_torch:
@@ -61,8 +55,6 @@ jobs:
             - run: sudo pip install torch
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
-            - run: sudo pip install mecab-python
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
     build_py2_tf:
@@ -76,10 +68,18 @@ jobs:
             - run: sudo pip install tensorflow
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest codecov pytest-cov
-            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
-            - run: sudo pip install mecab-python
             - run: python -m pytest -sv ./transformers/tests/ --cov
             - run: codecov
+    build_py3_custom_tokenizers:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:3.5
+        steps:
+            - checkout
+            - run: sudo pip install --progress-bar off .
+            - run: sudo pip install pytest
+            - run: sudo pip install mecab-python3
+            - run: python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
     deploy_doc:
         working_directory: ~/transformers
         docker:
diff --git a/transformers/tests/tokenization_bert_japanese_test.py b/transformers/tests/tokenization_bert_japanese_test.py
index 6f66b96411..545193c7cc 100644
--- a/transformers/tests/tokenization_bert_japanese_test.py
+++ b/transformers/tests/tokenization_bert_japanese_test.py
@@ -16,7 +16,6 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import os
 import unittest
-import pytest
 from io import open
 
 from transformers.tokenization_bert import WordpieceTokenizer
@@ -25,8 +24,10 @@ from transformers.tokenization_bert_japanese import (BertJapaneseTokenizer,
                                                      VOCAB_FILES_NAMES)
 
 from .tokenization_tests_commons import CommonTestCases
+from .utils import slow, custom_tokenizers
 
 
+@custom_tokenizers
 class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
 
     tokenizer_class = BertJapaneseTokenizer
@@ -104,7 +105,7 @@ class BertJapaneseTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(tokenizer.tokenize(u"こんばんは こんばんにちは こんにちは"),
                              [u"こん", u"##ばんは", u"[UNK]", u"こんにちは"])
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese")
 
@@ -172,7 +173,7 @@ class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTeste
         self.assertListEqual(tokenizer.tokenize(u"こんにちほ"),
                              [u"こ", u"ん", u"に", u"ち", u"[UNK]"])
 
-    @pytest.mark.slow
+    @slow
     def test_sequence_builders(self):
         tokenizer = self.tokenizer_class.from_pretrained("bert-base-japanese-char")
 
@@ -188,5 +189,3 @@ class BertJapaneseCharacterTokenizationTest(CommonTestCases.CommonTokenizerTeste
 
 
 
-if __name__ == '__main__':
-    unittest.main()
diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py
index 7a51ab612b..2b97293ca7 100644
--- a/transformers/tests/utils.py
+++ b/transformers/tests/utils.py
@@ -6,18 +6,23 @@ from distutils.util import strtobool
 from transformers.file_utils import _tf_available, _torch_available
 
 
-try:
-    run_slow = os.environ["RUN_SLOW"]
-except KeyError:
-    # RUN_SLOW isn't set, default to skipping slow tests.
-    _run_slow_tests = False
-else:
-    # RUN_SLOW is set, convert it to True or False.
+def parse_flag_from_env(key, default=False):
     try:
-        _run_slow_tests = strtobool(run_slow)
-    except ValueError:
-        # More values are supported, but let's keep the message simple.
-        raise ValueError("If set, RUN_SLOW must be yes or no.")
+        value = os.environ[key]
+    except KeyError:
+        # KEY isn't set, default to `default`.
+        _value = default
+    else:
+        # KEY is set, convert it to True or False.
+        try:
+            _value = strtobool(value)
+        except ValueError:
+            # More values are supported, but let's keep the message simple.
+            raise ValueError("If set, {} must be yes or no.".format(key))
+    return _value
+
+_run_slow_tests = parse_flag_from_env("RUN_SLOW", default=False)
+_run_custom_tokenizers = parse_flag_from_env("RUN_CUSTOM_TOKENIZERS", default=False)
 
 
 def slow(test_case):
@@ -33,6 +38,19 @@ def slow(test_case):
     return test_case
 
 
+def custom_tokenizers(test_case):
+    """
+    Decorator marking a test for a custom tokenizer.
+
+    Custom tokenizers require additional dependencies, and are skipped
+    by default. Set the RUN_CUSTOM_TOKENIZERS environment variable
+    to a truthy value to run them.
+    """
+    if not _run_custom_tokenizers:
+        test_case = unittest.skip("test of custom tokenizers")(test_case)
+    return test_case
+
+
 def require_torch(test_case):
     """
     Decorator marking a test that requires PyTorch.
@@ -59,6 +77,6 @@ def require_tf(test_case):
 
 if _torch_available:
     # Set the USE_CUDA environment variable to select a GPU.
-    torch_device = "cuda" if os.environ.get("USE_CUDA") else "cpu"
+    torch_device = "cuda" if parse_flag_from_env("USE_CUDA") else "cpu"
 else:
     torch_device = None

From 95854c4a2f8d418a14e64b4edf64fc7363b1ff10 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 21:46:00 +0000
Subject: [PATCH 051/110] Actually run the tests

---
 .circleci/config.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 7ca5f8121c..d8f624a0e5 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -79,7 +79,7 @@ jobs:
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest
             - run: sudo pip install mecab-python3
-            - run: python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
+            - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
     deploy_doc:
         working_directory: ~/transformers
         docker:

From 9cb97c0c0f7215971bb5a39cd070e5bd89319bdf Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 21:48:56 +0000
Subject: [PATCH 052/110] Actually run the tests

---
 .circleci/config.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index d8f624a0e5..9d6e02d580 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -101,6 +101,7 @@ workflows:
     version: 2
     build_and_test:
         jobs:
+            - build_py3_custom_tokenizers
             - build_py3_torch_and_tf
             - build_py3_torch
             - build_py3_tf

From 5505cf701477762cedf792e20344d29bc8bf6325 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 21:53:44 +0000
Subject: [PATCH 053/110] Run tests on Py2 too, for Lysandre

---
 .circleci/config.yml | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index 9d6e02d580..afc6d5ce44 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -80,6 +80,16 @@ jobs:
             - run: sudo pip install pytest
             - run: sudo pip install mecab-python3
             - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
+    build_py2_custom_tokenizers:
+        working_directory: ~/transformers
+        docker:
+            - image: circleci/python:2.7
+        steps:
+            - checkout
+            - run: sudo pip install --progress-bar off .
+            - run: sudo pip install pytest
+            - run: sudo pip install mecab-python
+            - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
     deploy_doc:
         working_directory: ~/transformers
         docker:
@@ -102,6 +112,7 @@ workflows:
     build_and_test:
         jobs:
             - build_py3_custom_tokenizers
+            - build_py2_custom_tokenizers
             - build_py3_torch_and_tf
             - build_py3_torch
             - build_py3_tf

From 371c5ddfad96689771465aff557152322190b60e Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 21:55:43 +0000
Subject: [PATCH 054/110] Py2 tests for Lysandre

---
 .circleci/config.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.circleci/config.yml b/.circleci/config.yml
index afc6d5ce44..c827a81fbb 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -88,6 +88,7 @@ jobs:
             - checkout
             - run: sudo pip install --progress-bar off .
             - run: sudo pip install pytest
+            - run: sudo apt-get -y install libmecab-dev mecab mecab-ipadic-utf8 swig
             - run: sudo pip install mecab-python
             - run: RUN_CUSTOM_TOKENIZERS=1 python -m pytest -sv ./transformers/tests/tokenization_bert_japanese_test.py
     deploy_doc:

From 36fc52a3b4b50885d5ec3bf259f81740e19d8b3c Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 10 Dec 2019 22:03:35 +0000
Subject: [PATCH 055/110] Update links to weights

---
 transformers/configuration_bert.py         | 8 ++++----
 transformers/modeling_bert.py              | 8 ++++----
 transformers/modeling_tf_bert.py           | 8 ++++----
 transformers/tokenization_bert_japanese.py | 8 ++++----
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py
index 16f1f60404..01fcd88cb8 100644
--- a/transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -42,10 +42,10 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-config.json",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-config.json",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-config.json",
-    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-config.json",
-    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-config.json",
-    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-config.json",
-    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-config.json"
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json"
 }
 
 
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index e2e115a015..d0f35272ac 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -48,10 +48,10 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-pytorch_model.bin",
     'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-pytorch_model.bin",
     'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-pytorch_model.bin",
-    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-pytorch_model.bin",
-    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-pytorch_model.bin",
-    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-pytorch_model.bin",
-    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
 }
 
 
diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py
index 27dd311a4d..7cc71f5063 100644
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -48,10 +48,10 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-large-uncased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-large-cased-whole-word-masking-finetuned-squad': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-cased-whole-word-masking-finetuned-squad-tf_model.h5",
     'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-tf_model.h5",
-    'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-tf_model.h5",
-    'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-tf_model.h5",
-    'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-tf_model.h5",
-    'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-tf_model.h5"
+    'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
+    'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
+    'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5"
 }
 
 
diff --git a/transformers/tokenization_bert_japanese.py b/transformers/tokenization_bert_japanese.py
index 1ce0e1d1cb..0ff45cbfe7 100644
--- a/transformers/tokenization_bert_japanese.py
+++ b/transformers/tokenization_bert_japanese.py
@@ -33,10 +33,10 @@ VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}
 PRETRAINED_VOCAB_FILES_MAP = {
     'vocab_file':
     {
-        'bert-base-japanese': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-vocab.txt",
-        'bert-base-japanese-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-whole-word-masking-vocab.txt",
-        'bert-base-japanese-char': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-vocab.txt",
-        'bert-base-japanese-char-whole-word-masking': "https://www.nlp.ecei.tohoku.ac.jp/~m-suzuki/bert-japanese/bert-base-japanese-char-whole-word-masking-vocab.txt"
+        'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-vocab.txt",
+        'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-vocab.txt",
+        'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-vocab.txt",
+        'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-vocab.txt"
     }
 }
 

From 1748fdf657ed804f3edc1e45077b703cd8d6e4c5 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 11 Dec 2019 23:31:23 +0000
Subject: [PATCH 056/110] [doc] Fix rst table

---
 docs/source/pretrained_models.rst | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index d3498e057d..775772e896 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -63,22 +63,22 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | (see `details on dbmdz repository <https://github.com/dbmdz/german-bert>`__).                                                         |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-japanese``                                     | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                 |
-|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                            |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   |                                                            | | Trained on Japanese text. Text is tokenized with MeCab and WordPiece.                                                               |
+|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-japanese-whole-word-masking``                  | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                        |
-|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                            |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized with MeCab and WordPiece.                                      |
+|                   |                                                            | | `MeCab <https://taku910.github.io/mecab/>`__ is required for tokenization.                                                          |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-japanese-char``                                | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                          |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   |                                                            | | Trained on Japanese text. Text is tokenized into characters.                                                                        |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 |                   | ``bert-base-japanese-char-whole-word-masking``             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
-|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                 |
-|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                             |
+|                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                               |
+|                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | OpenAI GPT English model                                                                                                            |

From 413f41921b650418798f7d5c246316c4e1e5eb5d Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 12 Dec 2019 07:34:42 +0100
Subject: [PATCH 057/110] fix merge

---
 transformers/tests/utils.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/transformers/tests/utils.py b/transformers/tests/utils.py
index daed431995..c950ad8f17 100644
--- a/transformers/tests/utils.py
+++ b/transformers/tests/utils.py
@@ -9,14 +9,6 @@ from transformers.file_utils import _tf_available, _torch_available
 SMALL_MODEL_IDENTIFIER = "julien-c/bert-xsmall-dummy"
 
 
-try:
-    run_slow = os.environ["RUN_SLOW"]
-except KeyError:
-    # RUN_SLOW isn't set, default to skipping slow tests.
-    _run_slow_tests = False
-else:
-    # RUN_SLOW is set, convert it to True or False.
-
 def parse_flag_from_env(key, default=False):
     try:
         value = os.environ[key]

From f19dad61c70a628545612e435c699263f02bc4a0 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Thu, 12 Dec 2019 14:46:30 +0100
Subject: [PATCH 058/110] fixing XLM conversion tests with dummy input

---
 transformers/modeling_tf_pytorch_utils.py |  6 +++++-
 transformers/modeling_tf_xlm.py           |  2 +-
 transformers/modeling_xlm.py              | 12 +++++++++++-
 3 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py
index 510e130c90..9d2b663dcb 100644
--- a/transformers/modeling_tf_pytorch_utils.py
+++ b/transformers/modeling_tf_pytorch_utils.py
@@ -78,6 +78,7 @@ def load_pytorch_checkpoint_in_tf2_model(tf_model, pytorch_checkpoint_path, tf_i
     logger.info("Loading PyTorch weights from {}".format(pt_path))
 
     pt_state_dict = torch.load(pt_path, map_location='cpu')
+    logger.info("PyTorch checkpoint contains {:,} parameters".format(sum(t.numel() for t in pt_state_dict.values())))
 
     return load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=tf_inputs, allow_missing_keys=allow_missing_keys)
 
@@ -134,7 +135,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
         start_prefix_to_remove = tf_model.base_model_prefix + '.'
 
     symbolic_weights = tf_model.trainable_weights + tf_model.non_trainable_weights
-
+    tf_loaded_numel = 0
     weight_value_tuples = []
     all_pytorch_weights = set(list(pt_state_dict.keys()))
     for symbolic_weight in symbolic_weights:
@@ -159,6 +160,7 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
             e.args += (symbolic_weight.shape, array.shape)
             raise e
 
+        tf_loaded_numel += array.size
         # logger.warning("Initialize TF weight {}".format(symbolic_weight.name))
 
         weight_value_tuples.append((symbolic_weight, array))
@@ -169,6 +171,8 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
     if tf_inputs is not None:
         tfo = tf_model(tf_inputs, training=False)  # Make sure restore ops are run
 
+    logger.info("Loaded {:,} parameters in the TF 2.0 model.".format(tf_loaded_numel))
+
     logger.info("Weights or buffers not loaded from PyTorch model: {}".format(all_pytorch_weights))
 
     return tf_model
diff --git a/transformers/modeling_tf_xlm.py b/transformers/modeling_tf_xlm.py
index 6f11b0537d..903a8596c3 100644
--- a/transformers/modeling_tf_xlm.py
+++ b/transformers/modeling_tf_xlm.py
@@ -460,7 +460,7 @@ class TFXLMPreTrainedModel(TFPreTrainedModel):
             langs_list = tf.constant([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
         else:
             langs_list = None
-        return [inputs_list, attns_list, langs_list]
+        return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
 
 
 XLM_START_DOCSTRING = r"""    The XLM model was proposed in
diff --git a/transformers/modeling_xlm.py b/transformers/modeling_xlm.py
index 257f0da394..b604ae669d 100644
--- a/transformers/modeling_xlm.py
+++ b/transformers/modeling_xlm.py
@@ -227,6 +227,16 @@ class XLMPreTrainedModel(PreTrainedModel):
     def __init__(self, *inputs, **kwargs):
         super(XLMPreTrainedModel, self).__init__(*inputs, **kwargs)
 
+    @property
+    def dummy_inputs(self):
+        inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
+        attns_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        if self.config.use_lang_emb and self.config.n_langs > 1:
+            langs_list = torch.tensor([[1, 1, 0, 0, 1], [1, 1, 1, 0, 0], [1, 0, 0, 1, 1]])
+        else:
+            langs_list = None
+        return {'input_ids': inputs_list, 'attention_mask': attns_list, 'langs': langs_list}
+
     def _init_weights(self, module):
         """ Initialize the weights. """
         if isinstance(module, nn.Embedding):
@@ -646,7 +656,7 @@ class XLMWithLMHeadModel(XLMPreTrainedModel):
                                                langs=langs,
                                                token_type_ids=token_type_ids,
                                                position_ids=position_ids,
-                                               lengths=lengths, 
+                                               lengths=lengths,
                                                cache=cache,
                                                head_mask=head_mask,
                                                inputs_embeds=inputs_embeds)

From fbf5455a8607fa660aacbf06c16f6fe23758b13d Mon Sep 17 00:00:00 2001
From: Alan deLevie <alan@casetext.com>
Date: Wed, 11 Dec 2019 10:14:48 -0500
Subject: [PATCH 059/110] Fix typo in examples/run_glue.py args declaration.

deay -> decay
---
 examples/run_glue.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 369a7110ab..1a51255c11 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -380,7 +380,7 @@ def main():
     parser.add_argument("--learning_rate", default=5e-5, type=float,
                         help="The initial learning rate for Adam.")
     parser.add_argument("--weight_decay", default=0.0, type=float,
-                        help="Weight deay if we apply some.")
+                        help="Weight decay if we apply some.")
     parser.add_argument("--adam_epsilon", default=1e-8, type=float,
                         help="Epsilon for Adam optimizer.")
     parser.add_argument("--max_grad_norm", default=1.0, type=float,

From fe92755b992eb61239ad361abae3b71f86bbbba1 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 12 Dec 2019 11:37:19 -0500
Subject: [PATCH 060/110] Fix special tokens mask in encode

---
 transformers/tokenization_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index f44b77b27c..7e86742286 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -973,7 +973,7 @@ class PreTrainedTokenizer(object):
             token_type_ids = [0] * len(ids) + ([1] * len(pair_ids) if pair else [])
 
         if return_special_tokens_mask:
-            encoded_inputs["special_tokens_mask"] = special_tokens_mask
+            encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
 
         encoded_inputs["input_ids"] = sequence
         if return_token_type_ids:

From 5d67aa21aefaaa62594e8dfb56093b83c5f547bb Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Thu, 12 Dec 2019 12:39:41 -0500
Subject: [PATCH 061/110] [doc] Replicate doc from #2144

---
 transformers/configuration_auto.py       |  1 +
 transformers/configuration_utils.py      |  1 +
 transformers/modeling_auto.py            |  4 ++++
 transformers/modeling_encoder_decoder.py |  2 ++
 transformers/modeling_tf_auto.py         |  4 ++++
 transformers/modeling_tf_utils.py        |  1 +
 transformers/modeling_utils.py           |  1 +
 transformers/tokenization_auto.py        | 11 +++++++++--
 transformers/tokenization_utils.py       |  4 ++--
 9 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py
index 43f251bd0c..fbc5c59199 100644
--- a/transformers/configuration_auto.py
+++ b/transformers/configuration_auto.py
@@ -83,6 +83,7 @@ class AutoConfig(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
 
diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py
index 8ae30f2a48..82959adb57 100644
--- a/transformers/configuration_utils.py
+++ b/transformers/configuration_utils.py
@@ -79,6 +79,7 @@ class PretrainedConfig(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model configuration to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model configuration that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing a configuration file saved using the :func:`~transformers.PretrainedConfig.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - a path or url to a saved configuration JSON `file`, e.g.: ``./my_model_directory/configuration.json``.
 
diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py
index 6ba1aab7a3..96f45d8ec4 100644
--- a/transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -93,6 +93,7 @@ class AutoModel(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
@@ -231,6 +232,7 @@ class AutoModelWithLMHead(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
@@ -360,6 +362,7 @@ class AutoModelForSequenceClassification(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
@@ -478,6 +481,7 @@ class AutoModelForQuestionAnswering(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
diff --git a/transformers/modeling_encoder_decoder.py b/transformers/modeling_encoder_decoder.py
index a884abd0a2..70f765b849 100644
--- a/transformers/modeling_encoder_decoder.py
+++ b/transformers/modeling_encoder_decoder.py
@@ -59,12 +59,14 @@ class PreTrainedEncoderDecoder(nn.Module):
             encoder_pretrained_model_name_or_path: information necessary to initiate the encoder. Either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/encoder``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
             decoder_pretrained_model_name_or_path: information necessary to initiate the decoder. Either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/decoder``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
 
diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py
index cfe19ead2a..fac92eb866 100644
--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -81,6 +81,7 @@ class TFAutoModel(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
 
@@ -212,6 +213,7 @@ class TFAutoModelWithLMHead(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
 
@@ -338,6 +340,7 @@ class TFAutoModelForSequenceClassification(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
 
@@ -453,6 +456,7 @@ class TFAutoModelForQuestionAnswering(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch, TF 1.X or TF 2.0 checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In the case of a PyTorch checkpoint, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument.
 
diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py
index 4a6d18f447..d9a93af21b 100644
--- a/transformers/modeling_tf_utils.py
+++ b/transformers/modeling_tf_utils.py
@@ -177,6 +177,7 @@ class TFPreTrainedModel(tf.keras.Model):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `PyTorch state_dict save file` (e.g. `./pt_model/pytorch_model.bin`). In this case, ``from_pt`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the PyTorch checkpoint in a TensorFlow model using the provided conversion scripts and loading the TensorFlow model afterwards.
 
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index 37088f8e67..676f355986 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -266,6 +266,7 @@ class PreTrainedModel(nn.Module):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a pre-trained model to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing model weights saved using :func:`~transformers.PreTrainedModel.save_pretrained`, e.g.: ``./my_model_directory/``.
                 - a path or url to a `tensorflow index checkpoint file` (e.g. `./tf_model/model.ckpt.index`). In this case, ``from_tf`` should be set to True and a configuration object should be provided as ``config`` argument. This loading path is slower than converting the TensorFlow checkpoint in a PyTorch model using the provided conversion scripts and loading the PyTorch model afterwards.
                 - None if you are both providing the configuration and state dictionary (resp. with keyword arguments ``config`` and ``state_dict``)
diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py
index f36a584521..1f0599ef7f 100644
--- a/transformers/tokenization_auto.py
+++ b/transformers/tokenization_auto.py
@@ -86,6 +86,7 @@ class AutoTokenizer(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
 
@@ -108,8 +109,14 @@ class AutoTokenizer(object):
 
         Examples::
 
-            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')    # Download vocabulary from S3 and cache.
-            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
+            # Download vocabulary from S3 and cache.
+            tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
+
+            # Download vocabulary from S3 (user-uploaded) and cache.
+            tokenizer = AutoTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
+
+            # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
+            tokenizer = AutoTokenizer.from_pretrained('./test/bert_saved_model/')
 
         """
         if 'distilbert' in pretrained_model_name_or_path:
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 7e86742286..317ecd167b 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -255,7 +255,7 @@ class PreTrainedTokenizer(object):
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
-                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmz/bert-base-german-cased``.
+                - a string with the `identifier name` of a predefined tokenizer that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
                 - a path to a `directory` containing vocabulary files required by the tokenizer, for instance saved using the :func:`~transformers.PreTrainedTokenizer.save_pretrained` method, e.g.: ``./my_model_directory/``.
                 - (not applicable to all derived classes) a path or url to a single saved vocabulary file if and only if the tokenizer only requires a single vocabulary file (e.g. Bert, XLNet), e.g.: ``./my_model_directory/vocab.txt``.
 
@@ -284,7 +284,7 @@ class PreTrainedTokenizer(object):
             tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
 
             # Download vocabulary from S3 (user-uploaded) and cache.
-            tokenizer = BertTokenizer.from_pretrained('dbmz/bert-base-german-cased')
+            tokenizer = BertTokenizer.from_pretrained('dbmdz/bert-base-german-cased')
 
             # If vocabulary files are in a directory (e.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`)
             tokenizer = BertTokenizer.from_pretrained('./test/saved_model/')

From 7296f1010b6faaf3b1fb409bc5a9ebadcea51973 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 12 Dec 2019 13:01:04 -0500
Subject: [PATCH 062/110] Clean up squad and allow train_file and
 predict_file usage

---
 examples/run_squad.py                 | 22 ++++++++++++++--------
 transformers/data/processors/squad.py |  6 ++++++
 2 files changed, 20 insertions(+), 8 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 79c8537a4b..117b86e32c 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -337,7 +337,7 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
     else:
         logger.info("Creating features from dataset file at %s", input_dir)
 
-        if not args.data_dir:
+        if not args.data_dir and ((evaluate and not args.predict_file) or (not evaluate and not args.train_file)):
             try:
                 import tensorflow_datasets as tfds
             except ImportError:
@@ -350,7 +350,11 @@ def load_and_cache_examples(args, tokenizer, evaluate=False, output_examples=Fal
             examples = SquadV1Processor().get_examples_from_dataset(tfds_examples, evaluate=evaluate)
         else:
             processor = SquadV2Processor() if args.version_2_with_negative else SquadV1Processor()
-            examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
+
+            if evaluate:
+                examples = processor.get_dev_examples(args.data_dir, filename=args.predict_file)
+            else:
+                examples = processor.get_train_examples(args.data_dir, filename=args.train_file)
 
         features, dataset = squad_convert_examples_to_features( 
             examples=examples,
@@ -387,7 +391,14 @@ def main():
 
     ## Other parameters
     parser.add_argument("--data_dir", default=None, type=str,
-                        help="The input data dir. Should contain the .json files for the task. If not specified, will run with tensorflow_datasets.")
+                        help="The input data dir. Should contain the .json files for the task." +
+                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
+    parser.add_argument("--train_file", default=None, type=str,
+                        help="The input training file. If a data dir is specified, will look for the file there" +
+                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
+    parser.add_argument("--predict_file", default=None, type=str,
+                        help="The input evaluation file. If a data dir is specified, will look for the file there" +
+                             "If no data dir or train/predict files are specified, will run with tensorflow_datasets.")
     parser.add_argument("--config_name", default="", type=str,
                         help="Pretrained config name or path if not the same as model_name")
     parser.add_argument("--tokenizer_name", default="", type=str,
@@ -472,11 +483,6 @@ def main():
     parser.add_argument('--server_port', type=str, default='', help="Can be used for distant debugging.")
     args = parser.parse_args()
 
-    args.predict_file = os.path.join(args.output_dir, 'predictions_{}_{}.txt'.format(
-        list(filter(None, args.model_name_or_path.split('/'))).pop(),
-        str(args.max_seq_length))
-    )
-
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir) and args.do_train and not args.overwrite_output_dir:
         raise ValueError("Output directory ({}) already exists and is not empty. Use --overwrite_output_dir to overcome.".format(args.output_dir))
 
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 3d7f832540..9bc4375684 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -373,6 +373,9 @@ class SquadProcessor(DataProcessor):
                 which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
 
         """
+        if data_dir is None:
+            data_dir = ""
+
         if self.train_file is None:
             raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
 
@@ -389,6 +392,9 @@ class SquadProcessor(DataProcessor):
             filename: None by default, specify this if the evaluation file has a different name than the original one
                 which is `train-v1.1.json` and `train-v2.0.json` for squad versions 1.1 and 2.0 respectively.
         """
+        if data_dir is None:
+            data_dir = ""
+
         if self.dev_file is None:
             raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
         

From 33e72b08d54bf5edd192492af7549b581563ecc2 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 13 Dec 2019 11:33:05 +0100
Subject: [PATCH 063/110] fix inner dimensions for 3B/11B models

---
 transformers/modeling_t5.py    | 27 +++++++++++----------------
 transformers/modeling_tf_t5.py | 20 ++++++++------------
 2 files changed, 19 insertions(+), 28 deletions(-)

diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index 149b977abc..c9310179a3 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -30,7 +30,7 @@ from torch import nn
 import torch.nn.functional as F
 from torch.nn import CrossEntropyLoss, MSELoss
 
-from .modeling_utils import PreTrainedModel
+from .modeling_utils import PreTrainedModel, prune_linear_layer
 from .configuration_t5 import T5Config
 from .file_utils import add_start_docstrings, DUMMY_INPUTS, DUMMY_MASK
 
@@ -191,28 +191,26 @@ class T5Attention(nn.Module):
 
         self.output_attentions = config.output_attentions
         self.relative_attention_num_buckets = config.relative_attention_num_buckets
-        self.dim = config.d_model
+        self.d_model = config.d_model
         self.d_kv = config.d_kv
         self.n_heads = config.num_heads
         self.dropout = config.dropout_rate
-        assert self.dim % self.n_heads == 0
-        assert self.dim // self.n_heads == self.d_kv
+        self.inner_dim = self.n_heads * self.d_kv
 
         # Mesh TensorFlow initialization to avoid scaling before softmax
-        self.q = nn.Linear(self.dim, self.dim, bias=False)
-        self.k = nn.Linear(self.dim, self.dim, bias=False)
-        self.v = nn.Linear(self.dim, self.dim, bias=False)
-        self.o = nn.Linear(self.dim, self.dim, bias=False)
+        self.q = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.k = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.v = nn.Linear(self.d_model, self.inner_dim, bias=False)
+        self.o = nn.Linear(self.inner_dim, self.d_model, bias=False)
 
         if self.has_relative_attention_bias:
             self.relative_attention_bias = nn.Embedding(self.relative_attention_num_buckets, self.n_heads)
         self.pruned_heads = set()
 
     def prune_heads(self, heads):
-        attention_head_size = self.dim // self.n_heads
         if len(heads) == 0:
             return
-        mask = torch.ones(self.n_heads, attention_head_size)
+        mask = torch.ones(self.n_heads, self.d_kv)
         heads = set(heads) - self.pruned_heads
         for head in heads:
             head -= sum(1 if h < head else 0 for h in self.pruned_heads)
@@ -226,7 +224,7 @@ class T5Attention(nn.Module):
         self.o = prune_linear_layer(self.o, index, dim=1)
         # Update hyper params
         self.n_heads = self.n_heads - len(heads)
-        self.dim = attention_head_size * self.n_heads
+        self.inner_dim = self.d_kv * self.n_heads
         self.pruned_heads = self.pruned_heads.union(heads)
 
     @staticmethod
@@ -303,17 +301,14 @@ class T5Attention(nn.Module):
             klen = qlen if cache is None else cache['slen'] + qlen
         else:
             klen = kv.size(1)
-        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
-        n_heads = self.n_heads
-        dim_per_head = self.dim // n_heads
 
         def shape(x):
             """  projection """
-            return x.view(bs, -1, self.n_heads, dim_per_head).transpose(1, 2)
+            return x.view(bs, -1, self.n_heads, self.d_kv).transpose(1, 2)
 
         def unshape(x):
             """  compute context """
-            return x.transpose(1, 2).contiguous().view(bs, -1, self.n_heads * dim_per_head)
+            return x.transpose(1, 2).contiguous().view(bs, -1, self.inner_dim)
 
         q = shape(self.q(input))                                          # (bs, n_heads, qlen, dim_per_head)
         if kv is None:
diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py
index fd25328ac6..0ae7fff412 100644
--- a/transformers/modeling_tf_t5.py
+++ b/transformers/modeling_tf_t5.py
@@ -108,17 +108,16 @@ class TFT5Attention(tf.keras.layers.Layer):
 
         self.output_attentions = config.output_attentions
         self.relative_attention_num_buckets = config.relative_attention_num_buckets
-        self.dim = config.d_model
+        self.d_model = config.d_model
         self.d_kv = config.d_kv
         self.n_heads = config.num_heads
-        assert self.dim % self.n_heads == 0
-        assert self.dim // self.n_heads == self.d_kv
+        self.inner_dim = self.n_heads * self.d_kv
 
         # Mesh TensorFlow initialization to avoid scaling before softmax
-        self.q = tf.keras.layers.Dense(self.dim, use_bias=False, name='q')
-        self.k = tf.keras.layers.Dense(self.dim, use_bias=False, name='k')
-        self.v = tf.keras.layers.Dense(self.dim, use_bias=False, name='v')
-        self.o = tf.keras.layers.Dense(self.dim, use_bias=False, name='o')
+        self.q = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='q')
+        self.k = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='k')
+        self.v = tf.keras.layers.Dense(self.inner_dim, use_bias=False, name='v')
+        self.o = tf.keras.layers.Dense(self.d_model, use_bias=False, name='o')
         self.dropout = tf.keras.layers.Dropout(config.dropout_rate)
 
         if self.has_relative_attention_bias:
@@ -199,17 +198,14 @@ class TFT5Attention(tf.keras.layers.Layer):
             klen = qlen if cache is None else cache['slen'] + qlen
         else:
             klen = shape_list(kv)[1]
-        # assert dim == self.dim, 'Dimensions do not match: %s input vs %s configured' % (dim, self.dim)
-        n_heads = self.n_heads
-        dim_per_head = self.dim // n_heads
 
         def shape(x):
             """  projection """
-            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, dim_per_head)), perm=(0, 2, 1, 3))
+            return tf.transpose(tf.reshape(x, (bs, -1, self.n_heads, self.d_kv)), perm=(0, 2, 1, 3))
 
         def unshape(x):
             """  compute context """
-            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.n_heads * dim_per_head))
+            return tf.reshape(tf.transpose(x, perm=(0, 2, 1, 3)), (bs, -1, self.inner_dim))
 
         q = shape(self.q(input))                                          # (bs, n_heads, qlen, dim_per_head)
         if kv is None:

From 47f0e3cfb7df192ab80215cea9096791fce08694 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 13 Dec 2019 14:33:24 +0100
Subject: [PATCH 064/110] cleaning up configuration classes

---
 .../summarization/configuration_bertabs.py    | 10 +--
 .../adding_a_new_model/configuration_xxx.py   | 12 +--
 .../tests/modeling_tf_xxx_test.py             |  2 +-
 .../tests/modeling_xxx_test.py                |  2 +-
 transformers/configuration_albert.py          |  6 +-
 transformers/configuration_bert.py            | 38 +++-----
 transformers/configuration_ctrl.py            | 23 +----
 transformers/configuration_distilbert.py      | 40 ++++-----
 transformers/configuration_gpt2.py            | 55 ++++--------
 transformers/configuration_openai.py          | 57 +++++-------
 transformers/configuration_transfo_xl.py      | 26 ++----
 transformers/configuration_utils.py           | 27 ++++--
 transformers/configuration_xlm.py             | 88 ++++++++-----------
 transformers/configuration_xlnet.py           | 81 +++++++----------
 ..._original_pytorch_checkpoint_to_pytorch.py |  2 +-
 transformers/modeling_gpt2.py                 |  1 +
 transformers/modeling_tf_gpt2.py              |  1 +
 transformers/modeling_tf_transfo_xl.py        |  6 +-
 .../modeling_tf_transfo_xl_utilities.py       | 12 +--
 transformers/modeling_tf_xlnet.py             |  2 +-
 transformers/modeling_transfo_xl.py           | 10 +--
 transformers/modeling_xlnet.py                |  4 +-
 transformers/tests/modeling_albert_test.py    |  2 +-
 transformers/tests/modeling_bert_test.py      |  2 +-
 transformers/tests/modeling_common_test.py    |  2 +-
 transformers/tests/modeling_ctrl_test.py      |  2 +-
 .../tests/modeling_distilbert_test.py         |  2 +-
 transformers/tests/modeling_gpt2_test.py      |  2 +-
 transformers/tests/modeling_openai_test.py    |  2 +-
 transformers/tests/modeling_roberta_test.py   |  2 +-
 transformers/tests/modeling_tf_albert_test.py |  2 +-
 transformers/tests/modeling_tf_bert_test.py   |  2 +-
 transformers/tests/modeling_tf_ctrl_test.py   |  2 +-
 .../tests/modeling_tf_distilbert_test.py      |  2 +-
 transformers/tests/modeling_tf_gpt2_test.py   |  2 +-
 .../tests/modeling_tf_openai_gpt_test.py      |  2 +-
 .../tests/modeling_tf_roberta_test.py         |  2 +-
 .../tests/modeling_tf_transfo_xl_test.py      |  2 +-
 transformers/tests/modeling_tf_xlm_test.py    |  2 +-
 transformers/tests/modeling_tf_xlnet_test.py  |  5 +-
 .../tests/modeling_transfo_xl_test.py         |  2 +-
 transformers/tests/modeling_xlm_test.py       |  2 +-
 transformers/tests/modeling_xlnet_test.py     |  5 +-
 43 files changed, 224 insertions(+), 329 deletions(-)

diff --git a/examples/summarization/configuration_bertabs.py b/examples/summarization/configuration_bertabs.py
index 5bcb65b423..054763ea93 100644
--- a/examples/summarization/configuration_bertabs.py
+++ b/examples/summarization/configuration_bertabs.py
@@ -65,7 +65,7 @@ class BertAbsConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vocab_size_or_config_json_file=30522,
+        vocab_size=30522,
         max_pos=512,
         enc_layers=6,
         enc_hidden_size=512,
@@ -81,14 +81,14 @@ class BertAbsConfig(PretrainedConfig):
     ):
         super(BertAbsConfig, self).__init__(**kwargs)
 
-        if self._input_is_path_to_json(vocab_size_or_config_json_file):
-            path_to_json = vocab_size_or_config_json_file
+        if self._input_is_path_to_json(vocab_size):
+            path_to_json = vocab_size
             with open(path_to_json, "r", encoding="utf-8") as reader:
                 json_config = json.loads(reader.read())
             for key, value in json_config.items():
                 self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
+        elif isinstance(vocab_size, int):
+            self.vocab_size = vocab_size
             self.max_pos = max_pos
 
             self.enc_layers = enc_layers
diff --git a/templates/adding_a_new_model/configuration_xxx.py b/templates/adding_a_new_model/configuration_xxx.py
index b1614e71af..ca9e0d554b 100644
--- a/templates/adding_a_new_model/configuration_xxx.py
+++ b/templates/adding_a_new_model/configuration_xxx.py
@@ -39,7 +39,7 @@ class XxxConfig(PretrainedConfig):
 
 
         Arguments:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XxxModel`.
+            vocab_size: Vocabulary size of `input_ids` in `XxxModel`.
             hidden_size: Size of the encoder layers and the pooler layer.
             num_hidden_layers: Number of hidden layers in the Transformer encoder.
             num_attention_heads: Number of attention heads for each attention layer in
@@ -64,7 +64,7 @@ class XxxConfig(PretrainedConfig):
     pretrained_config_archive_map = XXX_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=50257,
+                 vocab_size=50257,
                  n_positions=1024,
                  n_ctx=1024,
                  n_embd=768,
@@ -84,7 +84,7 @@ class XxxConfig(PretrainedConfig):
                  summary_first_dropout=0.1,
                  **kwargs):
         super(XxxConfig, self).__init__(**kwargs)
-        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, six.string_types) else -1
+        self.vocab_size = vocab_size if isinstance(vocab_size, six.string_types) else -1
         self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
@@ -102,12 +102,12 @@ class XxxConfig(PretrainedConfig):
         self.summary_activation = summary_activation
         self.summary_first_dropout = summary_first_dropout
         self.summary_proj_to_labels = summary_proj_to_labels
-        if isinstance(vocab_size_or_config_json_file, six.string_types):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
+        if isinstance(vocab_size, six.string_types):
+            with open(vocab_size, "r", encoding="utf-8") as reader:
                 json_config = json.loads(reader.read())
             for key, value in json_config.items():
                 self.__dict__[key] = value
-        elif not isinstance(vocab_size_or_config_json_file, int):
+        elif not isinstance(vocab_size, int):
             raise ValueError(
                 "First argument must be either a vocabulary size (int)"
                 "or the path to a pretrained model config file (str)"
diff --git a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
index d7e576bf8b..912a4aa340 100644
--- a/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_tf_xxx_test.py
@@ -111,7 +111,7 @@ class TFXxxModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = XxxConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
diff --git a/templates/adding_a_new_model/tests/modeling_xxx_test.py b/templates/adding_a_new_model/tests/modeling_xxx_test.py
index bfc70921cd..30e614b3f2 100644
--- a/templates/adding_a_new_model/tests/modeling_xxx_test.py
+++ b/templates/adding_a_new_model/tests/modeling_xxx_test.py
@@ -109,7 +109,7 @@ class XxxModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = XxxConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
diff --git a/transformers/configuration_albert.py b/transformers/configuration_albert.py
index de665c9b1c..6a1ef78dd5 100644
--- a/transformers/configuration_albert.py
+++ b/transformers/configuration_albert.py
@@ -37,7 +37,7 @@ class AlbertConfig(PretrainedConfig):
     pretrained_config_archive_map = ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=30000,
+                 vocab_size=30000,
                  embedding_size=128,
                  hidden_size=4096,
                  num_hidden_layers=12,
@@ -83,7 +83,7 @@ class AlbertConfig(PretrainedConfig):
         """
         super(AlbertConfig, self).__init__(**kwargs)
 
-        self.vocab_size = vocab_size_or_config_json_file
+        self.vocab_size = vocab_size
         self.embedding_size = embedding_size
         self.hidden_size = hidden_size
         self.num_hidden_layers = num_hidden_layers
@@ -97,4 +97,4 @@ class AlbertConfig(PretrainedConfig):
         self.max_position_embeddings = max_position_embeddings
         self.type_vocab_size = type_vocab_size
         self.initializer_range = initializer_range
-        self.layer_norm_eps = layer_norm_eps
\ No newline at end of file
+        self.layer_norm_eps = layer_norm_eps
diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py
index 01fcd88cb8..9072820bce 100644
--- a/transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -56,7 +56,7 @@ class BertConfig(PretrainedConfig):
 
 
         Arguments:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `BertModel`.
+            vocab_size: Vocabulary size of `input_ids` in `BertModel`.
             hidden_size: Size of the encoder layers and the pooler layer.
             num_hidden_layers: Number of hidden layers in the Transformer encoder.
             num_attention_heads: Number of attention heads for each attention layer in
@@ -81,7 +81,7 @@ class BertConfig(PretrainedConfig):
     pretrained_config_archive_map = BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=30522,
+                 vocab_size=30522,
                  hidden_size=768,
                  num_hidden_layers=12,
                  num_attention_heads=12,
@@ -95,25 +95,15 @@ class BertConfig(PretrainedConfig):
                  layer_norm_eps=1e-12,
                  **kwargs):
         super(BertConfig, self).__init__(**kwargs)
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.hidden_size = hidden_size
-            self.num_hidden_layers = num_hidden_layers
-            self.num_attention_heads = num_attention_heads
-            self.hidden_act = hidden_act
-            self.intermediate_size = intermediate_size
-            self.hidden_dropout_prob = hidden_dropout_prob
-            self.attention_probs_dropout_prob = attention_probs_dropout_prob
-            self.max_position_embeddings = max_position_embeddings
-            self.type_vocab_size = type_vocab_size
-            self.initializer_range = initializer_range
-            self.layer_norm_eps = layer_norm_eps
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
diff --git a/transformers/configuration_ctrl.py b/transformers/configuration_ctrl.py
index fcbd848dec..f9b9e409e1 100644
--- a/transformers/configuration_ctrl.py
+++ b/transformers/configuration_ctrl.py
@@ -31,7 +31,7 @@ class CTRLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `CTRLModel`.
 
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+        vocab_size: Vocabulary size of `input_ids` in `CTRLModel`.
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         dff: Size of the inner dimension of the FFN.
@@ -52,7 +52,7 @@ class CTRLConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vocab_size_or_config_json_file=246534,
+        vocab_size=246534,
         n_positions=256,
         n_ctx=256,
         n_embd=1280,
@@ -64,8 +64,6 @@ class CTRLConfig(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-6,
         initializer_range=0.02,
-
-        num_labels=1,
         summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
@@ -76,7 +74,7 @@ class CTRLConfig(PretrainedConfig):
         """Constructs CTRLConfig.
 
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `CTRLModel` or a configuration json file.
+            vocab_size: Vocabulary size of `input_ids` in `CTRLModel`.
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             dff: Size of the inner dimension of the FFN.
@@ -94,8 +92,7 @@ class CTRLConfig(PretrainedConfig):
                 initializing all weight matrices.
         """
         super(CTRLConfig, self).__init__(**kwargs)
-
-        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.vocab_size = vocab_size
         self.n_ctx = n_ctx
         self.n_positions = n_positions
         self.n_embd = n_embd
@@ -108,23 +105,11 @@ class CTRLConfig(PretrainedConfig):
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_range = initializer_range
 
-        self.num_labels = num_labels
         self.summary_type = summary_type
         self.summary_use_proj = summary_use_proj
         self.summary_activation = summary_activation
         self.summary_first_dropout = summary_first_dropout
         self.summary_proj_to_labels = summary_proj_to_labels
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif not isinstance(vocab_size_or_config_json_file, int):
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
 
     @property
     def max_position_embeddings(self):
diff --git a/transformers/configuration_distilbert.py b/transformers/configuration_distilbert.py
index d5d575be29..d9f7cc6348 100644
--- a/transformers/configuration_distilbert.py
+++ b/transformers/configuration_distilbert.py
@@ -37,7 +37,7 @@ class DistilBertConfig(PretrainedConfig):
     pretrained_config_archive_map = DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=30522,
+                 vocab_size=30522,
                  max_position_embeddings=512,
                  sinusoidal_pos_embds=False,
                  n_layers=6,
@@ -53,31 +53,21 @@ class DistilBertConfig(PretrainedConfig):
                  seq_classif_dropout=0.2,
                  **kwargs):
         super(DistilBertConfig, self).__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.sinusoidal_pos_embds = sinusoidal_pos_embds
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.dim = dim
+        self.hidden_dim = hidden_dim
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation = activation
+        self.initializer_range = initializer_range
+        self.tie_weights_ = tie_weights_
+        self.qa_dropout = qa_dropout
+        self.seq_classif_dropout = seq_classif_dropout
 
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.max_position_embeddings = max_position_embeddings
-            self.sinusoidal_pos_embds = sinusoidal_pos_embds
-            self.n_layers = n_layers
-            self.n_heads = n_heads
-            self.dim = dim
-            self.hidden_dim = hidden_dim
-            self.dropout = dropout
-            self.attention_dropout = attention_dropout
-            self.activation = activation
-            self.initializer_range = initializer_range
-            self.tie_weights_ = tie_weights_
-            self.qa_dropout = qa_dropout
-            self.seq_classif_dropout = seq_classif_dropout
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
     @property
     def hidden_size(self):
         return self.dim
diff --git a/transformers/configuration_gpt2.py b/transformers/configuration_gpt2.py
index c2fb4948d3..4c200c0760 100644
--- a/transformers/configuration_gpt2.py
+++ b/transformers/configuration_gpt2.py
@@ -36,7 +36,7 @@ class GPT2Config(PretrainedConfig):
     """Configuration class to store the configuration of a `GPT2Model`.
 
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+        vocab_size: Vocabulary size of `input_ids` in `GPT2Model`.
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         n_embd: Dimensionality of the embeddings and hidden states.
@@ -56,7 +56,7 @@ class GPT2Config(PretrainedConfig):
 
     def __init__(
         self,
-        vocab_size_or_config_json_file=50257,
+        vocab_size=50257,
         n_positions=1024,
         n_ctx=1024,
         n_embd=768,
@@ -67,8 +67,6 @@ class GPT2Config(PretrainedConfig):
         attn_pdrop=0.1,
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
-
-        num_labels=1,
         summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
@@ -79,7 +77,7 @@ class GPT2Config(PretrainedConfig):
         """Constructs GPT2Config.
 
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `GPT2Model` or a configuration json file.
+            vocab_size: Vocabulary size of `input_ids` in `GPT2Model`.
             n_positions: Number of positional embeddings.
             n_ctx: Size of the causal mask (usually same as n_positions).
             n_embd: Dimensionality of the embeddings and hidden states.
@@ -96,37 +94,22 @@ class GPT2Config(PretrainedConfig):
                 initializing all weight matrices.
         """
         super(GPT2Config, self).__init__(**kwargs)
-
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.n_ctx = n_ctx
-            self.n_positions = n_positions
-            self.n_embd = n_embd
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.resid_pdrop = resid_pdrop
-            self.embd_pdrop = embd_pdrop
-            self.attn_pdrop = attn_pdrop
-            self.layer_norm_epsilon = layer_norm_epsilon
-            self.initializer_range = initializer_range
-
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_first_dropout = summary_first_dropout
-            self.summary_proj_to_labels = summary_proj_to_labels
-        else:
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
+        self.vocab_size = vocab_size
+        self.n_ctx = n_ctx
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels
 
     @property
     def max_position_embeddings(self):
diff --git a/transformers/configuration_openai.py b/transformers/configuration_openai.py
index 886b7f5bc5..7776a0bb9f 100644
--- a/transformers/configuration_openai.py
+++ b/transformers/configuration_openai.py
@@ -35,7 +35,7 @@ class OpenAIGPTConfig(PretrainedConfig):
     Configuration class to store the configuration of a `OpenAIGPTModel`.
 
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `OpenAIGPTModel` or a configuration json file.
+        vocab_size: Vocabulary size of `input_ids` in `OpenAIGPTModel`.
         n_positions: Number of positional embeddings.
         n_ctx: Size of the causal mask (usually same as n_positions).
         n_embd: Dimensionality of the embeddings and hidden states.
@@ -58,7 +58,7 @@ class OpenAIGPTConfig(PretrainedConfig):
 
     def __init__(
         self,
-        vocab_size_or_config_json_file=40478,
+        vocab_size=40478,
         n_positions=512,
         n_ctx=512,
         n_embd=768,
@@ -71,8 +71,6 @@ class OpenAIGPTConfig(PretrainedConfig):
         layer_norm_epsilon=1e-5,
         initializer_range=0.02,
         predict_special_tokens=True,
-
-        num_labels=1,
         summary_type='cls_index',
         summary_use_proj=True,
         summary_activation=None,
@@ -83,39 +81,24 @@ class OpenAIGPTConfig(PretrainedConfig):
         """Constructs OpenAIGPTConfig.
         """
         super(OpenAIGPTConfig, self).__init__(**kwargs)
-
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.vocab_size = vocab_size_or_config_json_file
-            self.n_ctx = n_ctx
-            self.n_positions = n_positions
-            self.n_embd = n_embd
-            self.n_layer = n_layer
-            self.n_head = n_head
-            self.afn = afn
-            self.resid_pdrop = resid_pdrop
-            self.embd_pdrop = embd_pdrop
-            self.attn_pdrop = attn_pdrop
-            self.layer_norm_epsilon = layer_norm_epsilon
-            self.initializer_range = initializer_range
-            self.predict_special_tokens = predict_special_tokens
-
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_first_dropout = summary_first_dropout
-            self.summary_proj_to_labels = summary_proj_to_labels
-        else:
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
+        self.vocab_size = vocab_size
+        self.n_ctx = n_ctx
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.afn = afn
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.predict_special_tokens = predict_special_tokens
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_first_dropout = summary_first_dropout
+        self.summary_proj_to_labels = summary_proj_to_labels
 
     @property
     def max_position_embeddings(self):
diff --git a/transformers/configuration_transfo_xl.py b/transformers/configuration_transfo_xl.py
index d55a6adbe6..52f0f45a50 100644
--- a/transformers/configuration_transfo_xl.py
+++ b/transformers/configuration_transfo_xl.py
@@ -34,7 +34,7 @@ class TransfoXLConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `TransfoXLModel`.
 
         Args:
-            vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `TransfoXLModel` or a configuration json file.
+            vocab_size: Vocabulary size of `input_ids` in `TransfoXLModel`.
             cutoffs: cutoffs for the adaptive softmax
             d_model: Dimensionality of the model's hidden states.
             d_embed: Dimensionality of the embeddings
@@ -68,7 +68,7 @@ class TransfoXLConfig(PretrainedConfig):
     pretrained_config_archive_map = TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=267735,
+                 vocab_size=267735,
                  cutoffs=[20000, 40000, 200000],
                  d_model=1024,
                  d_embed=1024,
@@ -100,7 +100,7 @@ class TransfoXLConfig(PretrainedConfig):
         """Constructs TransfoXLConfig.
         """
         super(TransfoXLConfig, self).__init__(**kwargs)
-        self.n_token = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.vocab_size = vocab_size
         self.cutoffs = []
         self.cutoffs.extend(cutoffs)
         self.tie_weight = tie_weight
@@ -133,27 +133,17 @@ class TransfoXLConfig(PretrainedConfig):
         self.init_std = init_std
         self.layer_norm_epsilon = layer_norm_epsilon
 
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif not isinstance(vocab_size_or_config_json_file, int):
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
-
     @property
     def max_position_embeddings(self):
         return self.tgt_len + self.ext_len + self.mem_len
 
     @property
-    def vocab_size(self):
-        return self.n_token
+    def n_token(self):  # Backward compatibility
+        return self.vocab_size
 
-    @vocab_size.setter
-    def vocab_size(self, value):
-        self.n_token = value
+    @n_token.setter
+    def n_token(self, value):  # Backward compatibility
+        self.vocab_size = value
 
     @property
     def hidden_size(self):
diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py
index 82959adb57..6c9eeea175 100644
--- a/transformers/configuration_utils.py
+++ b/transformers/configuration_utils.py
@@ -49,8 +49,7 @@ class PretrainedConfig(object):
     pretrained_config_archive_map = {}
 
     def __init__(self, **kwargs):
-        self.finetuning_task = kwargs.pop('finetuning_task', None)
-        self.num_labels = kwargs.pop('num_labels', 2)
+        # Attributes with defaults
         self.output_attentions = kwargs.pop('output_attentions', False)
         self.output_hidden_states = kwargs.pop('output_hidden_states', False)
         self.output_past = kwargs.pop('output_past', True)  # Not used by all models
@@ -59,6 +58,22 @@ class PretrainedConfig(object):
         self.pruned_heads = kwargs.pop('pruned_heads', {})
         self.is_decoder = kwargs.pop('is_decoder', False)
 
+        # Fine-tuning task arguments
+        self.finetuning_task = kwargs.pop('finetuning_task', None)
+        self.num_labels = kwargs.pop('num_labels', 2)
+        self.id2label = kwargs.pop('id2label', {i: 'LABEL_{}'.format(i) for i in range(self.num_labels)})
+        self.id2label = dict((int(key), value) for key, value in self.id2label.items())
+        self.label2id = kwargs.pop('label2id', dict(zip(self.id2label.values(), self.id2label.keys())))
+        self.label2id = dict((key, int(value)) for key, value in self.label2id.items())
+
+        # Additional attributes without default values
+        for key, value in kwargs.items():
+            try:
+                setattr(self, key, value)
+            except AttributeError as err:
+                logger.error("Can't set {} with value {} for {}".format(key, value, self))
+                raise err
+
     def save_pretrained(self, save_directory):
         """ Save a configuration object to the directory `save_directory`, so that it
             can be re-loaded using the :func:`~transformers.PretrainedConfig.from_pretrained` class method.
@@ -183,17 +198,15 @@ class PretrainedConfig(object):
     @classmethod
     def from_dict(cls, json_object):
         """Constructs a `Config` from a Python dictionary of parameters."""
-        config = cls(vocab_size_or_config_json_file=-1)
-        for key, value in json_object.items():
-            setattr(config, key, value)
-        return config
+        return cls(**json_object)
 
     @classmethod
     def from_json_file(cls, json_file):
         """Constructs a `Config` from a json file of parameters."""
         with open(json_file, "r", encoding='utf-8') as reader:
             text = reader.read()
-        return cls.from_dict(json.loads(text))
+        dict_obj = json.loads(text)
+        return cls(**dict_obj)
 
     def __eq__(self, other):
         return self.__dict__ == other.__dict__
diff --git a/transformers/configuration_xlm.py b/transformers/configuration_xlm.py
index fa3a5f40f6..0740cc4026 100644
--- a/transformers/configuration_xlm.py
+++ b/transformers/configuration_xlm.py
@@ -42,7 +42,7 @@ class XLMConfig(PretrainedConfig):
     """Configuration class to store the configuration of a `XLMModel`.
 
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of `inputs_ids` in `XLMModel`.
+        vocab_size: Vocabulary size of `input_ids` in `XLMModel`.
         d_model: Size of the encoder layers and the pooler layer.
         n_layer: Number of hidden layers in the Transformer encoder.
         n_head: Number of attention heads for each attention layer in
@@ -81,7 +81,7 @@ class XLMConfig(PretrainedConfig):
     pretrained_config_archive_map = XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=30145,
+                 vocab_size=30145,
                  emb_dim=2048,
                  n_layers=12,
                  n_heads=16,
@@ -103,9 +103,6 @@ class XLMConfig(PretrainedConfig):
                  unk_index=3,
                  mask_index=5,
                  is_encoder=True,
-
-                 finetuning_task=None,
-                 num_labels=2,
                  summary_type='first',
                  summary_use_proj=True,
                  summary_activation=None,
@@ -117,56 +114,43 @@ class XLMConfig(PretrainedConfig):
         """Constructs XLMConfig.
         """
         super(XLMConfig, self).__init__(**kwargs)
-
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.n_words = vocab_size_or_config_json_file
-            self.emb_dim = emb_dim
-            self.n_layers = n_layers
-            self.n_heads = n_heads
-            self.dropout = dropout
-            self.attention_dropout = attention_dropout
-            self.gelu_activation = gelu_activation
-            self.sinusoidal_embeddings = sinusoidal_embeddings
-            self.causal = causal
-            self.asm = asm
-            self.n_langs = n_langs
-            self.use_lang_emb = use_lang_emb
-            self.layer_norm_eps = layer_norm_eps
-            self.bos_index = bos_index
-            self.eos_index = eos_index
-            self.pad_index = pad_index
-            self.unk_index = unk_index
-            self.mask_index = mask_index
-            self.is_encoder = is_encoder
-            self.max_position_embeddings = max_position_embeddings
-            self.embed_init_std = embed_init_std
-            self.init_std = init_std
-            self.finetuning_task = finetuning_task
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_proj_to_labels = summary_proj_to_labels
-            self.summary_first_dropout = summary_first_dropout
-            self.start_n_top = start_n_top
-            self.end_n_top = end_n_top
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
+        self.vocab_size = vocab_size
+        self.emb_dim = emb_dim
+        self.n_layers = n_layers
+        self.n_heads = n_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.gelu_activation = gelu_activation
+        self.sinusoidal_embeddings = sinusoidal_embeddings
+        self.causal = causal
+        self.asm = asm
+        self.n_langs = n_langs
+        self.use_lang_emb = use_lang_emb
+        self.layer_norm_eps = layer_norm_eps
+        self.bos_index = bos_index
+        self.eos_index = eos_index
+        self.pad_index = pad_index
+        self.unk_index = unk_index
+        self.mask_index = mask_index
+        self.is_encoder = is_encoder
+        self.max_position_embeddings = max_position_embeddings
+        self.embed_init_std = embed_init_std
+        self.init_std = init_std
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_proj_to_labels = summary_proj_to_labels
+        self.summary_first_dropout = summary_first_dropout
+        self.start_n_top = start_n_top
+        self.end_n_top = end_n_top
 
     @property
-    def vocab_size(self):
-        return self.n_words
+    def n_words(self):  # For backward compatibility
+        return self.vocab_size
 
-    @vocab_size.setter
-    def vocab_size(self, value):
-        self.n_words = value
+    @n_words.setter
+    def n_words(self, value):  # For backward compatibility
+        self.vocab_size = value
 
     @property
     def hidden_size(self):
diff --git a/transformers/configuration_xlnet.py b/transformers/configuration_xlnet.py
index 0dbf518849..017c57cfd5 100644
--- a/transformers/configuration_xlnet.py
+++ b/transformers/configuration_xlnet.py
@@ -35,7 +35,7 @@ class XLNetConfig(PretrainedConfig):
     """Configuration class to store the configuration of a ``XLNetModel``.
 
     Args:
-        vocab_size_or_config_json_file: Vocabulary size of ``inputs_ids`` in ``XLNetModel``.
+        vocab_size: Vocabulary size of ``input_ids`` in ``XLNetModel``.
         d_model: Size of the encoder layers and the pooler layer.
         n_layer: Number of hidden layers in the Transformer encoder.
         n_head: Number of attention heads for each attention layer in
@@ -72,28 +72,22 @@ class XLNetConfig(PretrainedConfig):
     pretrained_config_archive_map = XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=32000,
+                 vocab_size=32000,
                  d_model=1024,
                  n_layer=24,
                  n_head=16,
                  d_inner=4096,
-                 max_position_embeddings=512,
                  ff_activation="gelu",
                  untie_r=True,
                  attn_type="bi",
-
                  initializer_range=0.02,
                  layer_norm_eps=1e-12,
-
                  dropout=0.1,
                  mem_len=None,
                  reuse_len=None,
                  bi_data=False,
                  clamp_len=-1,
                  same_length=False,
-
-                 finetuning_task=None,
-                 num_labels=2,
                  summary_type='last',
                  summary_use_proj=True,
                  summary_activation='tanh',
@@ -104,58 +98,45 @@ class XLNetConfig(PretrainedConfig):
         """Constructs XLNetConfig.
         """
         super(XLNetConfig, self).__init__(**kwargs)
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.n_layer = n_layer
+        self.n_head = n_head
+        assert d_model % n_head == 0
+        self.d_head = d_model // n_head
+        self.ff_activation = ff_activation
+        self.d_inner = d_inner
+        self.untie_r = untie_r
+        self.attn_type = attn_type
 
-        if isinstance(vocab_size_or_config_json_file, str) or (sys.version_info[0] == 2
-                        and isinstance(vocab_size_or_config_json_file, unicode)):
-            with open(vocab_size_or_config_json_file, "r", encoding='utf-8') as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                setattr(config, key, value)
-        elif isinstance(vocab_size_or_config_json_file, int):
-            self.n_token = vocab_size_or_config_json_file
-            self.d_model = d_model
-            self.n_layer = n_layer
-            self.n_head = n_head
-            assert d_model % n_head == 0
-            self.d_head = d_model // n_head
-            self.ff_activation = ff_activation
-            self.d_inner = d_inner
-            self.untie_r = untie_r
-            self.attn_type = attn_type
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
 
-            self.initializer_range = initializer_range
-            self.layer_norm_eps = layer_norm_eps
+        self.dropout = dropout
+        self.mem_len = mem_len
+        self.reuse_len = reuse_len
+        self.bi_data = bi_data
+        self.clamp_len = clamp_len
+        self.same_length = same_length
 
-            self.dropout = dropout
-            self.mem_len = mem_len
-            self.reuse_len = reuse_len
-            self.bi_data = bi_data
-            self.clamp_len = clamp_len
-            self.same_length = same_length
-
-            self.finetuning_task = finetuning_task
-            self.num_labels = num_labels
-            self.summary_type = summary_type
-            self.summary_use_proj = summary_use_proj
-            self.summary_activation = summary_activation
-            self.summary_last_dropout = summary_last_dropout
-            self.start_n_top = start_n_top
-            self.end_n_top = end_n_top
-        else:
-            raise ValueError("First argument must be either a vocabulary size (int)"
-                             " or the path to a pretrained model config file (str)")
+        self.summary_type = summary_type
+        self.summary_use_proj = summary_use_proj
+        self.summary_activation = summary_activation
+        self.summary_last_dropout = summary_last_dropout
+        self.start_n_top = start_n_top
+        self.end_n_top = end_n_top
 
     @property
     def max_position_embeddings(self):
         return -1
 
     @property
-    def vocab_size(self):
-        return self.n_token
+    def n_token(self):  # Backward compatibility
+        return self.vocab_size
 
-    @vocab_size.setter
-    def vocab_size(self, value):
-        self.n_token = value
+    @n_token.setter
+    def n_token(self, value):  # Backward compatibility
+        self.vocab_size = value
 
     @property
     def hidden_size(self):
diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
index 60935add60..b4dc1bb61b 100644
--- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -46,7 +46,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
     roberta.eval()  # disable dropout
     config = BertConfig(
-        vocab_size_or_config_json_file=50265,
+        vocab_size=50265,
         hidden_size=roberta.args.encoder_embed_dim,
         num_hidden_layers=roberta.args.encoder_layers,
         num_attention_heads=roberta.args.encoder_attention_heads,
diff --git a/transformers/modeling_gpt2.py b/transformers/modeling_gpt2.py
index 96fd1c0607..ea660262d7 100644
--- a/transformers/modeling_gpt2.py
+++ b/transformers/modeling_gpt2.py
@@ -634,6 +634,7 @@ class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     """
     def __init__(self, config):
         super(GPT2DoubleHeadsModel, self).__init__(config)
+        config.num_labels = 1
         self.transformer = GPT2Model(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)
diff --git a/transformers/modeling_tf_gpt2.py b/transformers/modeling_tf_gpt2.py
index c738e5e8e3..973473179f 100644
--- a/transformers/modeling_tf_gpt2.py
+++ b/transformers/modeling_tf_gpt2.py
@@ -574,6 +574,7 @@ class TFGPT2DoubleHeadsModel(TFGPT2PreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
         super(TFGPT2DoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+        config.num_labels = 1
         self.transformer = TFGPT2MainLayer(config, name='transformer')
         self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
 
diff --git a/transformers/modeling_tf_transfo_xl.py b/transformers/modeling_tf_transfo_xl.py
index fd325e218e..848edfa37a 100644
--- a/transformers/modeling_tf_transfo_xl.py
+++ b/transformers/modeling_tf_transfo_xl.py
@@ -353,7 +353,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
 
-        self.n_token = config.n_token
+        self.n_token = config.vocab_size
 
         self.d_embed = config.d_embed
         self.d_model = config.d_model
@@ -361,7 +361,7 @@ class TFTransfoXLMainLayer(tf.keras.layers.Layer):
         self.d_head = config.d_head
         self.untie_r = config.untie_r
 
-        self.word_emb = TFAdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs, 
+        self.word_emb = TFAdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs, 
                                             div_val=config.div_val, init_std=config.init_std, name='word_emb')
 
         self.drop = tf.keras.layers.Dropout(config.dropout)
@@ -729,7 +729,7 @@ class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
             raise NotImplementedError
         # use adaptive softmax (including standard softmax)
         else:
-            self.crit = TFAdaptiveSoftmaxMask(config.n_token, config.d_embed, config.d_model, 
+            self.crit = TFAdaptiveSoftmaxMask(config.vocab_size, config.d_embed, config.d_model, 
                                               config.cutoffs, div_val=config.div_val, name='crit')
 
     def reset_length(self, tgt_len, ext_len, mem_len):
diff --git a/transformers/modeling_tf_transfo_xl_utilities.py b/transformers/modeling_tf_transfo_xl_utilities.py
index e6a6dfe686..f730af851f 100644
--- a/transformers/modeling_tf_transfo_xl_utilities.py
+++ b/transformers/modeling_tf_transfo_xl_utilities.py
@@ -25,15 +25,15 @@ import tensorflow as tf
 from .modeling_tf_utils import shape_list
 
 class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
-    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1,
+    def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1,
                  keep_order=False, **kwargs):
         super(TFAdaptiveSoftmaxMask, self).__init__(**kwargs)
 
-        self.n_token = n_token
+        self.vocab_size = vocab_size
         self.d_embed = d_embed
         self.d_proj = d_proj
 
-        self.cutoffs = cutoffs + [n_token]
+        self.cutoffs = cutoffs + [vocab_size]
         self.cutoff_ends = [0] + self.cutoffs
         self.div_val = div_val
 
@@ -66,11 +66,11 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
                     self.out_projs.append(weight)
                 else:
                     self.out_projs.append(None)
-                weight = self.add_weight(shape=(self.n_token, self.d_embed,),
+                weight = self.add_weight(shape=(self.vocab_size, self.d_embed,),
                                          initializer='zeros',
                                          trainable=True,
                                          name='out_layers_._{}_._weight'.format(i))
-                bias = self.add_weight(shape=(self.n_token,),
+                bias = self.add_weight(shape=(self.vocab_size,),
                                          initializer='zeros',
                                          trainable=True,
                                          name='out_layers_._{}_._bias'.format(i))
@@ -114,7 +114,7 @@ class TFAdaptiveSoftmaxMask(tf.keras.layers.Layer):
         hidden, target = inputs
         head_logprob = 0
         if self.n_clusters == 0:
-            softmax_b = tf.get_variable('bias', [n_token], initializer=tf.zeros_initializer())
+            softmax_b = tf.get_variable('bias', [self.vocab_size], initializer=tf.zeros_initializer())
             output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
             if target is not None:
                 loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
diff --git a/transformers/modeling_tf_xlnet.py b/transformers/modeling_tf_xlnet.py
index 759b57d835..dde2b6a8df 100644
--- a/transformers/modeling_tf_xlnet.py
+++ b/transformers/modeling_tf_xlnet.py
@@ -366,7 +366,7 @@ class TFXLNetMainLayer(tf.keras.layers.Layer):
         self.use_bfloat16 = config.use_bfloat16
         self.initializer_range = config.initializer_range
 
-        self.word_embedding = TFSharedEmbeddings(config.n_token, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
+        self.word_embedding = TFSharedEmbeddings(config.vocab_size, config.d_model, initializer_range=config.initializer_range, name='word_embedding')
         self.layer = [TFXLNetLayer(config, name='layer_._{}'.format(i)) for i in range(config.n_layer)]
         self.dropout = tf.keras.layers.Dropout(config.dropout)
 
diff --git a/transformers/modeling_transfo_xl.py b/transformers/modeling_transfo_xl.py
index a6a82f0dfe..f87d857a7f 100644
--- a/transformers/modeling_transfo_xl.py
+++ b/transformers/modeling_transfo_xl.py
@@ -592,14 +592,14 @@ class TransfoXLModel(TransfoXLPreTrainedModel):
         self.output_attentions = config.output_attentions
         self.output_hidden_states = config.output_hidden_states
 
-        self.n_token = config.n_token
+        self.n_token = config.vocab_size
 
         self.d_embed = config.d_embed
         self.d_model = config.d_model
         self.n_head = config.n_head
         self.d_head = config.d_head
 
-        self.word_emb = AdaptiveEmbedding(config.n_token, config.d_embed, config.d_model, config.cutoffs,
+        self.word_emb = AdaptiveEmbedding(config.vocab_size, config.d_embed, config.d_model, config.cutoffs,
                                           div_val=config.div_val)
 
         self.drop = nn.Dropout(config.dropout)
@@ -836,11 +836,11 @@ class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
         self.sample_softmax = config.sample_softmax
         # use sampled softmax
         if config.sample_softmax > 0:
-            self.out_layer = nn.Linear(config.d_model, config.n_token)
-            self.sampler = LogUniformSampler(config.n_token, config.sample_softmax)
+            self.out_layer = nn.Linear(config.d_model, config.vocab_size)
+            self.sampler = LogUniformSampler(config.vocab_size, config.sample_softmax)
         # use adaptive softmax (including standard softmax)
         else:
-            self.crit = ProjectedAdaptiveLogSoftmax(config.n_token, config.d_embed, config.d_model,
+            self.crit = ProjectedAdaptiveLogSoftmax(config.vocab_size, config.d_embed, config.d_model,
                                                     config.cutoffs, div_val=config.div_val)
         self.init_weights()
 
diff --git a/transformers/modeling_xlnet.py b/transformers/modeling_xlnet.py
index 225e5b059b..daed5f2857 100644
--- a/transformers/modeling_xlnet.py
+++ b/transformers/modeling_xlnet.py
@@ -609,7 +609,7 @@ class XLNetModel(XLNetPreTrainedModel):
         self.clamp_len = config.clamp_len
         self.n_layer = config.n_layer
 
-        self.word_embedding = nn.Embedding(config.n_token, config.d_model)
+        self.word_embedding = nn.Embedding(config.vocab_size, config.d_model)
         self.mask_emb = nn.Parameter(torch.FloatTensor(1, 1, config.d_model))
         self.layer = nn.ModuleList([XLNetLayer(config) for _ in range(config.n_layer)])
         self.dropout = nn.Dropout(config.dropout)
@@ -940,7 +940,7 @@ class XLNetLMHeadModel(XLNetPreTrainedModel):
         self.same_length = config.same_length
 
         self.transformer = XLNetModel(config)
-        self.lm_loss = nn.Linear(config.d_model, config.n_token, bias=True)
+        self.lm_loss = nn.Linear(config.d_model, config.vocab_size, bias=True)
 
         self.init_weights()
 
diff --git a/transformers/tests/modeling_albert_test.py b/transformers/tests/modeling_albert_test.py
index a14d66ae8f..1911d244e7 100644
--- a/transformers/tests/modeling_albert_test.py
+++ b/transformers/tests/modeling_albert_test.py
@@ -110,7 +110,7 @@ class AlbertModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = AlbertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
diff --git a/transformers/tests/modeling_bert_test.py b/transformers/tests/modeling_bert_test.py
index 539f66cd3f..0eb7bc9a14 100644
--- a/transformers/tests/modeling_bert_test.py
+++ b/transformers/tests/modeling_bert_test.py
@@ -109,7 +109,7 @@ class BertModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = BertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index 80d5d95455..f86eb7a3d0 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -633,7 +633,7 @@ class CommonTestCases:
                 mc_token_ids = ids_tensor([self.batch_size, self.n_choices], self.seq_length)
 
             config = self.config_class(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_positions=self.n_positions,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
diff --git a/transformers/tests/modeling_ctrl_test.py b/transformers/tests/modeling_ctrl_test.py
index 8c14578a5c..c7de49b2ab 100644
--- a/transformers/tests/modeling_ctrl_test.py
+++ b/transformers/tests/modeling_ctrl_test.py
@@ -114,7 +114,7 @@ class CTRLModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = CTRLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
diff --git a/transformers/tests/modeling_distilbert_test.py b/transformers/tests/modeling_distilbert_test.py
index 4b8f64327d..82f71c40da 100644
--- a/transformers/tests/modeling_distilbert_test.py
+++ b/transformers/tests/modeling_distilbert_test.py
@@ -105,7 +105,7 @@ class DistilBertModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = DistilBertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 dim=self.hidden_size,
                 n_layers=self.num_hidden_layers,
                 n_heads=self.num_attention_heads,
diff --git a/transformers/tests/modeling_gpt2_test.py b/transformers/tests/modeling_gpt2_test.py
index ecaa2a4bd0..a82e39c261 100644
--- a/transformers/tests/modeling_gpt2_test.py
+++ b/transformers/tests/modeling_gpt2_test.py
@@ -110,7 +110,7 @@ class GPT2ModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
diff --git a/transformers/tests/modeling_openai_test.py b/transformers/tests/modeling_openai_test.py
index 8e4d13438d..7655e432e8 100644
--- a/transformers/tests/modeling_openai_test.py
+++ b/transformers/tests/modeling_openai_test.py
@@ -98,7 +98,7 @@ class OpenAIGPTModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
diff --git a/transformers/tests/modeling_roberta_test.py b/transformers/tests/modeling_roberta_test.py
index 7a3553b164..4d34a50528 100644
--- a/transformers/tests/modeling_roberta_test.py
+++ b/transformers/tests/modeling_roberta_test.py
@@ -106,7 +106,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = RobertaConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
diff --git a/transformers/tests/modeling_tf_albert_test.py b/transformers/tests/modeling_tf_albert_test.py
index 7d3325b70b..93aeab66c2 100644
--- a/transformers/tests/modeling_tf_albert_test.py
+++ b/transformers/tests/modeling_tf_albert_test.py
@@ -118,7 +118,7 @@ class TFAlbertModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = AlbertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
diff --git a/transformers/tests/modeling_tf_bert_test.py b/transformers/tests/modeling_tf_bert_test.py
index d7a86fecb9..20073e1ab8 100644
--- a/transformers/tests/modeling_tf_bert_test.py
+++ b/transformers/tests/modeling_tf_bert_test.py
@@ -114,7 +114,7 @@ class TFBertModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = BertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
diff --git a/transformers/tests/modeling_tf_ctrl_test.py b/transformers/tests/modeling_tf_ctrl_test.py
index 0b421c20c9..0876582e57 100644
--- a/transformers/tests/modeling_tf_ctrl_test.py
+++ b/transformers/tests/modeling_tf_ctrl_test.py
@@ -112,7 +112,7 @@ class TFCTRLModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = CTRLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
diff --git a/transformers/tests/modeling_tf_distilbert_test.py b/transformers/tests/modeling_tf_distilbert_test.py
index 0ec45150ca..d9e971c2a5 100644
--- a/transformers/tests/modeling_tf_distilbert_test.py
+++ b/transformers/tests/modeling_tf_distilbert_test.py
@@ -107,7 +107,7 @@ class TFDistilBertModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = DistilBertConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 dim=self.hidden_size,
                 n_layers=self.num_hidden_layers,
                 n_heads=self.num_attention_heads,
diff --git a/transformers/tests/modeling_tf_gpt2_test.py b/transformers/tests/modeling_tf_gpt2_test.py
index e070b72e65..3f30b32787 100644
--- a/transformers/tests/modeling_tf_gpt2_test.py
+++ b/transformers/tests/modeling_tf_gpt2_test.py
@@ -115,7 +115,7 @@ class TFGPT2ModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = GPT2Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
diff --git a/transformers/tests/modeling_tf_openai_gpt_test.py b/transformers/tests/modeling_tf_openai_gpt_test.py
index 675e806c12..863dbf1bc0 100644
--- a/transformers/tests/modeling_tf_openai_gpt_test.py
+++ b/transformers/tests/modeling_tf_openai_gpt_test.py
@@ -114,7 +114,7 @@ class TFOpenAIGPTModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = OpenAIGPTConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_embd=self.hidden_size,
                 n_layer=self.num_hidden_layers,
                 n_head=self.num_attention_heads,
diff --git a/transformers/tests/modeling_tf_roberta_test.py b/transformers/tests/modeling_tf_roberta_test.py
index 42440bf1b7..f4ed97c44b 100644
--- a/transformers/tests/modeling_tf_roberta_test.py
+++ b/transformers/tests/modeling_tf_roberta_test.py
@@ -109,7 +109,7 @@ class TFRobertaModelTest(TFCommonTestCases.TFCommonModelTester):
                 choice_labels = ids_tensor([self.batch_size], self.num_choices)
 
             config = RobertaConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 hidden_size=self.hidden_size,
                 num_hidden_layers=self.num_hidden_layers,
                 num_attention_heads=self.num_attention_heads,
diff --git a/transformers/tests/modeling_tf_transfo_xl_test.py b/transformers/tests/modeling_tf_transfo_xl_test.py
index 03e332bdc1..553263250a 100644
--- a/transformers/tests/modeling_tf_transfo_xl_test.py
+++ b/transformers/tests/modeling_tf_transfo_xl_test.py
@@ -92,7 +92,7 @@ class TFTransfoXLModelTest(TFCommonTestCases.TFCommonModelTester):
                 lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             config = TransfoXLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 cutoffs=self.cutoffs,
diff --git a/transformers/tests/modeling_tf_xlm_test.py b/transformers/tests/modeling_tf_xlm_test.py
index a680b70367..228e436149 100644
--- a/transformers/tests/modeling_tf_xlm_test.py
+++ b/transformers/tests/modeling_tf_xlm_test.py
@@ -125,7 +125,7 @@ class TFXLMModelTest(TFCommonTestCases.TFCommonModelTester):
                 is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
 
             config = XLMConfig(
-                 vocab_size_or_config_json_file=self.vocab_size,
+                 vocab_size=self.vocab_size,
                  n_special=self.n_special,
                  emb_dim=self.hidden_size,
                  n_layers=self.num_hidden_layers,
diff --git a/transformers/tests/modeling_tf_xlnet_test.py b/transformers/tests/modeling_tf_xlnet_test.py
index 94864b86f2..eb66d92793 100644
--- a/transformers/tests/modeling_tf_xlnet_test.py
+++ b/transformers/tests/modeling_tf_xlnet_test.py
@@ -64,7 +64,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                      num_attention_heads=4,
                      d_inner=128,
                      num_hidden_layers=5,
-                     max_position_embeddings=10,
                      type_sequence_label_size=2,
                      untie_r=True,
                      bi_data=False,
@@ -88,7 +87,6 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
             self.num_attention_heads = num_attention_heads
             self.d_inner = d_inner
             self.num_hidden_layers = num_hidden_layers
-            self.max_position_embeddings = max_position_embeddings
             self.bi_data = bi_data
             self.untie_r = untie_r
             self.same_length = same_length
@@ -122,13 +120,12 @@ class TFXLNetModelTest(TFCommonTestCases.TFCommonModelTester):
                 is_impossible_labels = ids_tensor([self.batch_size], 2, dtype=tf.float32)
 
             config = XLNetConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 d_model=self.hidden_size,
                 n_head=self.num_attention_heads,
                 d_inner=self.d_inner,
                 n_layer=self.num_hidden_layers,
                 untie_r=self.untie_r,
-                max_position_embeddings=self.max_position_embeddings,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 same_length=self.same_length,
diff --git a/transformers/tests/modeling_transfo_xl_test.py b/transformers/tests/modeling_transfo_xl_test.py
index 647dd3724d..dca46444ba 100644
--- a/transformers/tests/modeling_transfo_xl_test.py
+++ b/transformers/tests/modeling_transfo_xl_test.py
@@ -91,7 +91,7 @@ class TransfoXLModelTest(CommonTestCases.CommonModelTester):
                 lm_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             config = TransfoXLConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 cutoffs=self.cutoffs,
diff --git a/transformers/tests/modeling_xlm_test.py b/transformers/tests/modeling_xlm_test.py
index f6b980767c..7cae6c848e 100644
--- a/transformers/tests/modeling_xlm_test.py
+++ b/transformers/tests/modeling_xlm_test.py
@@ -121,7 +121,7 @@ class XLMModelTest(CommonTestCases.CommonModelTester):
                 is_impossible_labels = ids_tensor([self.batch_size], 2).float()
 
             config = XLMConfig(
-                 vocab_size_or_config_json_file=self.vocab_size,
+                 vocab_size=self.vocab_size,
                  n_special=self.n_special,
                  emb_dim=self.hidden_size,
                  n_layers=self.num_hidden_layers,
diff --git a/transformers/tests/modeling_xlnet_test.py b/transformers/tests/modeling_xlnet_test.py
index 56b6bb3f4d..6d901ee699 100644
--- a/transformers/tests/modeling_xlnet_test.py
+++ b/transformers/tests/modeling_xlnet_test.py
@@ -60,7 +60,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                      num_attention_heads=4,
                      d_inner=128,
                      num_hidden_layers=5,
-                     max_position_embeddings=10,
                      type_sequence_label_size=2,
                      untie_r=True,
                      bi_data=False,
@@ -84,7 +83,6 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
             self.num_attention_heads = num_attention_heads
             self.d_inner = d_inner
             self.num_hidden_layers = num_hidden_layers
-            self.max_position_embeddings = max_position_embeddings
             self.bi_data = bi_data
             self.untie_r = untie_r
             self.same_length = same_length
@@ -116,13 +114,12 @@ class XLNetModelTest(CommonTestCases.CommonModelTester):
                 token_labels = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
 
             config = XLNetConfig(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 d_model=self.hidden_size,
                 n_head=self.num_attention_heads,
                 d_inner=self.d_inner,
                 n_layer=self.num_hidden_layers,
                 untie_r=self.untie_r,
-                max_position_embeddings=self.max_position_embeddings,
                 mem_len=self.mem_len,
                 clamp_len=self.clamp_len,
                 same_length=self.same_length,

From 8ade2040984c2cd3fd04bf56b133f70718254b03 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 13 Dec 2019 14:48:47 +0100
Subject: [PATCH 065/110] fix tf

---
 transformers/modeling_openai.py    | 1 +
 transformers/modeling_tf_openai.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/transformers/modeling_openai.py b/transformers/modeling_openai.py
index 4fe7ffee8b..72f1224e39 100644
--- a/transformers/modeling_openai.py
+++ b/transformers/modeling_openai.py
@@ -590,6 +590,7 @@ class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     def __init__(self, config):
         super(OpenAIGPTDoubleHeadsModel, self).__init__(config)
 
+        config.num_labels = 1
         self.transformer = OpenAIGPTModel(config)
         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
         self.multiple_choice_head = SequenceSummary(config)
diff --git a/transformers/modeling_tf_openai.py b/transformers/modeling_tf_openai.py
index dac3b17590..bd469f0205 100644
--- a/transformers/modeling_tf_openai.py
+++ b/transformers/modeling_tf_openai.py
@@ -538,6 +538,7 @@ class TFOpenAIGPTDoubleHeadsModel(TFOpenAIGPTPreTrainedModel):
     """
     def __init__(self, config, *inputs, **kwargs):
         super(TFOpenAIGPTDoubleHeadsModel, self).__init__(config, *inputs, **kwargs)
+        config.num_labels = 1
         self.transformer = TFOpenAIGPTMainLayer(config, name='transformer')
         self.multiple_choice_head = TFSequenceSummary(config, initializer_range=config.initializer_range, name='multiple_choice_head')
 

From 5a5c4349e8a141d2c0915d71cb3cee101da0db6f Mon Sep 17 00:00:00 2001
From: Pierric Cistac <Pierrci@users.noreply.github.com>
Date: Fri, 13 Dec 2019 10:02:33 -0500
Subject: [PATCH 066/110] Fix summarization `to_cpu` doc

---
 examples/summarization/README.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/summarization/README.md b/examples/summarization/README.md
index 96825cfa46..b98581e8e5 100644
--- a/examples/summarization/README.md
+++ b/examples/summarization/README.md
@@ -29,7 +29,7 @@ And move all the stories to the same folder. We will refer as `$DATA_PATH` the p
 python run_summarization.py \
     --documents_dir $DATA_PATH \
     --summaries_output_dir $SUMMARIES_PATH \ # optional
-    --to_cpu false \
+    --no_cuda false \
     --batch_size 4 \
     --min_length 50 \
     --max_length 200 \
@@ -39,7 +39,7 @@ python run_summarization.py \
     --compute_rouge true
 ```
 
-The scripts executes on GPU if one is available and if `to_cpu` is not set to `true`. Inference on multiple GPUs is not suported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
+The script executes on GPU if one is available and if `no_cuda` is not set to `true`. Inference on multiple GPUs is not supported yet. The ROUGE scores will be displayed in the console at the end of evaluation and written in a `rouge_scores.txt` file. The script takes 30 hours to compute with a single Tesla V100 GPU and a batch size of 10 (300,000 texts to summarize).
 
 ## Summarize any text
 
@@ -49,7 +49,7 @@ Put the documents that you would like to summarize in a folder (the path to whic
 python run_summarization.py \
     --documents_dir $DATA_PATH \
     --summaries_output_dir $SUMMARIES_PATH \ # optional
-    --to_cpu false \
+    --no_cuda false \
     --batch_size 4 \
     --min_length 50 \
     --max_length 200 \

From 5c00e344c1350e079d428a4d69cbb465ca7ffde9 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Fri, 13 Dec 2019 16:33:29 +0100
Subject: [PATCH 067/110] update model doc - switch 3B/11B to 3b/11b

---
 docs/source/pretrained_models.rst | 25 ++++++++++---------------
 transformers/configuration_t5.py  |  4 ++--
 transformers/modeling_t5.py       |  4 ++--
 transformers/modeling_tf_t5.py    |  4 ++--
 transformers/tokenization_t5.py   |  8 ++++----
 5 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 7e1366b53a..c6b990f213 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -217,25 +217,20 @@ Here is the full list of the currently provided pretrained models together with
 |                   |                                                            | | ALBERT xxlarge model with no dropout, additional training data and longer training                                                  |
 |                   |                                                            | (see `details <https://github.com/google-research/ALBERT>`__)                                                                         |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-| T5                | ``t5-small``                                               | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
-|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint                                                   |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+| T5                | ``t5-small``                                               | | ~60M parameters with 6-layers, 512-hidden-state, 2048 feed-forward hidden-state, 8-heads,                                           |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-base``                                                | | 6-layer, 768-hidden, 12-heads, 66M parameters                                                                                       |
-|                   |                                                            | | The DistilBERT model distilled from the BERT model `bert-base-uncased` checkpoint, with an additional linear layer.                 |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   | ``t5-base``                                                | | ~220M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 12-heads,                                        |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-large``                                               | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
-|                   |                                                            | | The DistilGPT2 model distilled from the GPT2 model `gpt2` checkpoint.                                                               |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   | ``t5-large``                                               | | ~770M parameters with 24-layers, 1024-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-3b``                                                  | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
-|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   | ``t5-3b``                                                  | | ~2.8B parameters with 24-layers, 1024-hidden-state, 16384 feed-forward hidden-state, 32-heads,                                      |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
 |                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
-|                   | ``t5-11b``                                                 | | 6-layer, 768-hidden, 12-heads, 82M parameters                                                                                       |
-|                   |                                                            | | The DistilRoBERTa model distilled from the RoBERTa model `roberta-base` checkpoint.                                                 |
-|                   |                                                            | (see `details <https://github.com/huggingface/transformers/tree/master/examples/distillation>`__)                                     |
+|                   | ``t5-11b``                                                 | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads,                                      |
+|                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 
 
diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py
index 2ccdebc2b1..6391cb4180 100644
--- a/transformers/configuration_t5.py
+++ b/transformers/configuration_t5.py
@@ -30,8 +30,8 @@ T5_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-config.json",
     't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-config.json",
     't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-config.json",
-    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-config.json",
-    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-config.json",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-config.json",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-config.json",
 }
 
 
diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index c9310179a3..263dc33b70 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -44,8 +44,8 @@ T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
     't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-pytorch_model.bin",
     't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-pytorch_model.bin",
     't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-pytorch_model.bin",
-    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-pytorch_model.bin",
-    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-pytorch_model.bin",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-pytorch_model.bin",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-pytorch_model.bin",
 }
 
 ####################################################
diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py
index 0ae7fff412..1336a1c30d 100644
--- a/transformers/modeling_tf_t5.py
+++ b/transformers/modeling_tf_t5.py
@@ -34,8 +34,8 @@ TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP = {
     't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-small-tf_model.h5",
     't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-base-tf_model.h5",
     't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-large-tf_model.h5",
-    't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3B-tf_model.h5",
-    't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11B-tf_model.h5",
+    't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-3b-tf_model.h5",
+    't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-11b-tf_model.h5",
 }
 
 ####################################################
diff --git a/transformers/tokenization_t5.py b/transformers/tokenization_t5.py
index 62e9c069e2..9fd37b67c0 100644
--- a/transformers/tokenization_t5.py
+++ b/transformers/tokenization_t5.py
@@ -44,8 +44,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         't5-small': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
         't5-base': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
         't5-large': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        't5-3B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
-        't5-11B': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-3b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
+        't5-11b': "https://s3.amazonaws.com/models.huggingface.co/bert/t5-spiece.model",
     }
 }
 
@@ -56,8 +56,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     't5-small': 512,
     't5-base': 512,
     't5-large': 512,
-    't5-3B': 512,
-    't5-11B': 512,
+    't5-3b': 512,
+    't5-11b': 512,
 }
 
 class T5Tokenizer(PreTrainedTokenizer):

From c8ed1c82c8a42ef700d4129d227fa356385c1d60 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Fri, 13 Dec 2019 12:13:48 -0500
Subject: [PATCH 068/110] [SQUAD] Load checkpoint when evaluating without
 training

---
 examples/run_squad.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 117b86e32c..a39915ee8b 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -580,10 +580,16 @@ def main():
     # Evaluation - we can ask to evaluate all the checkpoints (sub-directories) in a directory
     results = {}
     if args.do_eval and args.local_rank in [-1, 0]:
-        checkpoints = [args.output_dir]
-        if args.eval_all_checkpoints:
-            checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
-            logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+
+        if args.do_train:
+            logger.info("Loading checkpoints saved during training for evaluation")
+            checkpoints = [args.output_dir]
+            if args.eval_all_checkpoints:
+                checkpoints = list(os.path.dirname(c) for c in sorted(glob.glob(args.output_dir + '/**/' + WEIGHTS_NAME, recursive=True)))
+                logging.getLogger("transformers.modeling_utils").setLevel(logging.WARN)  # Reduce model loading logs
+        else:
+            logger.info("Loading checkpoint %s for evaluation", args.model_name_or_path)
+            checkpoints = [args.model_name_or_path]
 
         logger.info("Evaluate the following checkpoints: %s", checkpoints)
 

From f24a228a9315a4b723509bc9144b53d2bcbc4217 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Fri, 13 Dec 2019 14:50:35 -0500
Subject: [PATCH 069/110] Speed up tokenization process

---
 transformers/data/processors/squad.py |  2 +-
 transformers/tokenization_utils.py    | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 9bc4375684..e193f6153e 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -116,7 +116,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
     unique_id = 1000000000
 
     features = []
-    for (example_index, example) in enumerate(tqdm(examples)):
+    for (example_index, example) in enumerate(tqdm(examples, desc="Converting examples to features")):
         if is_training and not example.is_impossible:
             # Get start and end position
             start_position = example.start_position
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 317ecd167b..e87c87787b 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -637,9 +637,11 @@ class PreTrainedTokenizer(object):
             text: The sequence to be encoded.
             **kwargs: passed to the child `self.tokenize()` method
         """
+        all_special_tokens = self.all_special_tokens
+
         def lowercase_text(t):
             # convert non-special tokens to lowercase
-            escaped_special_toks = [re.escape(s_tok) for s_tok in self.all_special_tokens]
+            escaped_special_toks = [re.escape(s_tok) for s_tok in all_special_tokens]
             pattern = r'(^' + r'|'.join(escaped_special_toks) + r')|' + \
                       r'(.+?)'
             return re.sub(
@@ -680,17 +682,17 @@ class PreTrainedTokenizer(object):
                 tokenized_text = []
                 for sub_text in text_list:
                     if sub_text not in self.added_tokens_encoder \
-                            and sub_text not in self.all_special_tokens:
+                            and sub_text not in all_special_tokens:
                         tokenized_text += split_on_token(tok, sub_text)
                     else:
                         tokenized_text += [sub_text]
                 text_list = tokenized_text
 
             return list(itertools.chain.from_iterable((self._tokenize(token, **kwargs) if token not \
-                    in self.added_tokens_encoder and token not in self.all_special_tokens \
+                    in self.added_tokens_encoder and token not in all_special_tokens \
                     else [token] for token in tokenized_text)))
 
-        added_tokens = list(self.added_tokens_encoder.keys()) + self.all_special_tokens
+        added_tokens = list(self.added_tokens_encoder.keys()) + all_special_tokens
         tokenized_text = split_on_tokens(added_tokens, text)
         return tokenized_text
 

From d46147294852694d1dc701c72b9053ff2e726265 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Fri, 13 Dec 2019 15:31:52 -0500
Subject: [PATCH 070/110] return for SQuAD [BLACKED]

---
 transformers/data/processors/glue.py  |   2 +-
 transformers/data/processors/squad.py | 280 ++++++++++++++++----------
 2 files changed, 172 insertions(+), 110 deletions(-)

diff --git a/transformers/data/processors/glue.py b/transformers/data/processors/glue.py
index 518251b050..11ebd949de 100644
--- a/transformers/data/processors/glue.py
+++ b/transformers/data/processors/glue.py
@@ -133,7 +133,7 @@ def glue_convert_examples_to_features(examples, tokenizer,
     if is_tf_available() and is_tf_dataset:
         def gen():
             for ex in features:
-                yield  ({'input_ids': ex.input_ids,
+                yield ({'input_ids': ex.input_ids,
                          'attention_mask': ex.attention_mask,
                          'token_type_ids': ex.token_type_ids},
                         ex.label)
diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index e193f6153e..84aa429e26 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -18,19 +18,20 @@ if is_tf_available():
 
 logger = logging.getLogger(__name__)
 
-def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer,
-                        orig_answer_text):
+
+def _improve_answer_span(doc_tokens, input_start, input_end, tokenizer, orig_answer_text):
     """Returns tokenized answer spans that better match the annotated answer."""
     tok_answer_text = " ".join(tokenizer.tokenize(orig_answer_text))
 
     for new_start in range(input_start, input_end + 1):
         for new_end in range(input_end, new_start - 1, -1):
-            text_span = " ".join(doc_tokens[new_start:(new_end + 1)])
+            text_span = " ".join(doc_tokens[new_start : (new_end + 1)])
             if text_span == tok_answer_text:
                 return (new_start, new_end)
 
     return (input_start, input_end)
 
+
 def _check_is_max_context(doc_spans, cur_span_index, position):
     """Check if this is the 'max context' doc span for the token."""
     best_score = None
@@ -50,10 +51,11 @@ def _check_is_max_context(doc_spans, cur_span_index, position):
 
     return cur_span_index == best_span_index
 
+
 def _new_check_is_max_context(doc_spans, cur_span_index, position):
     """Check if this is the 'max context' doc span for the token."""
     # if len(doc_spans) == 1:
-        # return True
+    # return True
     best_score = None
     best_span_index = None
     for (span_index, doc_span) in enumerate(doc_spans):
@@ -71,14 +73,16 @@ def _new_check_is_max_context(doc_spans, cur_span_index, position):
 
     return cur_span_index == best_span_index
 
+
 def _is_whitespace(c):
     if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
         return True
     return False
 
-def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
-                                       doc_stride, max_query_length, is_training, 
-                                       return_dataset=False):
+
+def squad_convert_examples_to_features(
+    examples, tokenizer, max_seq_length, doc_stride, max_query_length, is_training, return_dataset=False
+):
     """
     Converts a list of examples into a list of features that can be directly given as input to a model.
     It is model-dependant and takes advantage of many of the tokenizer's features to create the model's inputs.
@@ -112,7 +116,7 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         )
     """
 
-    # Defining helper methods    
+    # Defining helper methods
     unique_id = 1000000000
 
     features = []
@@ -123,13 +127,12 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             end_position = example.end_position
 
             # If the answer cannot be found in the text, then skip this example.
-            actual_text = " ".join(example.doc_tokens[start_position:(end_position + 1)])
+            actual_text = " ".join(example.doc_tokens[start_position : (end_position + 1)])
             cleaned_answer_text = " ".join(whitespace_tokenize(example.answer_text))
             if actual_text.find(cleaned_answer_text) == -1:
                 logger.warning("Could not find answer: '%s' vs. '%s'", actual_text, cleaned_answer_text)
                 continue
 
-
         tok_to_orig_index = []
         orig_to_tok_index = []
         all_doc_tokens = []
@@ -140,7 +143,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                 tok_to_orig_index.append(i)
                 all_doc_tokens.append(sub_token)
 
-
         if is_training and not example.is_impossible:
             tok_start_position = orig_to_tok_index[example.start_position]
             if example.end_position < len(example.doc_tokens) - 1:
@@ -153,36 +155,41 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             )
 
         spans = []
-        
-        truncated_query = tokenizer.encode(example.question_text, add_special_tokens=False, max_length=max_query_length)
-        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence 
-        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair 
+
+        truncated_query = tokenizer.encode(
+            example.question_text, add_special_tokens=False, max_length=max_query_length
+        )
+        sequence_added_tokens = tokenizer.max_len - tokenizer.max_len_single_sentence
+        sequence_pair_added_tokens = tokenizer.max_len - tokenizer.max_len_sentences_pair
 
         span_doc_tokens = all_doc_tokens
         while len(spans) * doc_stride < len(all_doc_tokens):
-            
+
             encoded_dict = tokenizer.encode_plus(
-                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens, 
-                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query, 
-                max_length=max_seq_length, 
-                return_overflowing_tokens=True, 
+                truncated_query if tokenizer.padding_side == "right" else span_doc_tokens,
+                span_doc_tokens if tokenizer.padding_side == "right" else truncated_query,
+                max_length=max_seq_length,
+                return_overflowing_tokens=True,
                 pad_to_max_length=True,
                 stride=max_seq_length - doc_stride - len(truncated_query) - sequence_pair_added_tokens,
-                truncation_strategy='only_second' if tokenizer.padding_side == "right" else 'only_first'
+                truncation_strategy="only_second" if tokenizer.padding_side == "right" else "only_first",
             )
 
-            paragraph_len = min(len(all_doc_tokens) - len(spans) * doc_stride, max_seq_length - len(truncated_query) - sequence_pair_added_tokens)
+            paragraph_len = min(
+                len(all_doc_tokens) - len(spans) * doc_stride,
+                max_seq_length - len(truncated_query) - sequence_pair_added_tokens,
+            )
 
-            if tokenizer.pad_token_id in encoded_dict['input_ids']: 
-                non_padded_ids = encoded_dict['input_ids'][:encoded_dict['input_ids'].index(tokenizer.pad_token_id)]
+            if tokenizer.pad_token_id in encoded_dict["input_ids"]:
+                non_padded_ids = encoded_dict["input_ids"][: encoded_dict["input_ids"].index(tokenizer.pad_token_id)]
             else:
-                non_padded_ids = encoded_dict['input_ids']
+                non_padded_ids = encoded_dict["input_ids"]
 
             tokens = tokenizer.convert_ids_to_tokens(non_padded_ids)
 
             token_to_orig_map = {}
             for i in range(paragraph_len):
-                index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i 
+                index = len(truncated_query) + sequence_added_tokens + i if tokenizer.padding_side == "right" else i
                 token_to_orig_map[index] = tok_to_orig_index[len(spans) * doc_stride + i]
 
             encoded_dict["paragraph_len"] = paragraph_len
@@ -202,16 +209,20 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
         for doc_span_index in range(len(spans)):
             for j in range(spans[doc_span_index]["paragraph_len"]):
                 is_max_context = _new_check_is_max_context(spans, doc_span_index, doc_span_index * doc_stride + j)
-                index = j if tokenizer.padding_side == "left" else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+                index = (
+                    j
+                    if tokenizer.padding_side == "left"
+                    else spans[doc_span_index]["truncated_query_with_special_tokens_length"] + j
+                )
                 spans[doc_span_index]["token_is_max_context"][index] = is_max_context
 
         for span in spans:
             # Identify the position of the CLS token
-            cls_index = span['input_ids'].index(tokenizer.cls_token_id)
+            cls_index = span["input_ids"].index(tokenizer.cls_token_id)
 
             # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
             # Original TF implem also keep the classification token (set to 0) (not sure why...)
-            p_mask = np.array(span['token_type_ids'])
+            p_mask = np.array(span["token_type_ids"])
 
             p_mask = np.minimum(p_mask, 1)
 
@@ -224,7 +235,6 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
             # Set the CLS index to '0'
             p_mask[cls_index] = 0
 
-
             span_is_impossible = example.is_impossible
             start_position = 0
             end_position = 0
@@ -247,55 +257,99 @@ def squad_convert_examples_to_features(examples, tokenizer, max_seq_length,
                         doc_offset = 0
                     else:
                         doc_offset = len(truncated_query) + sequence_added_tokens
-                        
+
                     start_position = tok_start_position - doc_start + doc_offset
                     end_position = tok_end_position - doc_start + doc_offset
 
-
-            features.append(SquadFeatures(
-                span['input_ids'],
-                span['attention_mask'],
-                span['token_type_ids'],
-                cls_index,
-                p_mask.tolist(),
-
-                example_index=example_index,
-                unique_id=unique_id,
-                paragraph_len=span['paragraph_len'],
-                token_is_max_context=span["token_is_max_context"],
-                tokens=span["tokens"],
-                token_to_orig_map=span["token_to_orig_map"],
-                
-                start_position=start_position,
-                end_position=end_position
-            ))
+            features.append(
+                SquadFeatures(
+                    span["input_ids"],
+                    span["attention_mask"],
+                    span["token_type_ids"],
+                    cls_index,
+                    p_mask.tolist(),
+                    example_index=example_index,
+                    unique_id=unique_id,
+                    paragraph_len=span["paragraph_len"],
+                    token_is_max_context=span["token_is_max_context"],
+                    tokens=span["tokens"],
+                    token_to_orig_map=span["token_to_orig_map"],
+                    start_position=start_position,
+                    end_position=end_position,
+                )
+            )
 
             unique_id += 1
 
-    if return_dataset == 'pt':
+    if return_dataset == "pt":
         if not is_torch_available():
             raise ImportError("Pytorch must be installed to return a pytorch dataset.")
 
         # Convert to Tensors and build dataset
         all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
-        all_input_mask = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
-        all_segment_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
+        all_attention_masks = torch.tensor([f.attention_mask for f in features], dtype=torch.long)
+        all_token_type_ids = torch.tensor([f.token_type_ids for f in features], dtype=torch.long)
         all_cls_index = torch.tensor([f.cls_index for f in features], dtype=torch.long)
         all_p_mask = torch.tensor([f.p_mask for f in features], dtype=torch.float)
 
         if not is_training:
             all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)
-            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                    all_example_index, all_cls_index, all_p_mask)
+            dataset = TensorDataset(
+                all_input_ids, all_attention_masks, all_token_type_ids, all_example_index, all_cls_index, all_p_mask
+            )
         else:
             all_start_positions = torch.tensor([f.start_position for f in features], dtype=torch.long)
             all_end_positions = torch.tensor([f.end_position for f in features], dtype=torch.long)
-            dataset = TensorDataset(all_input_ids, all_input_mask, all_segment_ids,
-                                    all_start_positions, all_end_positions,
-                                    all_cls_index, all_p_mask)
+            dataset = TensorDataset(
+                all_input_ids,
+                all_attention_masks,
+                all_token_type_ids,
+                all_start_positions,
+                all_end_positions,
+                all_cls_index,
+                all_p_mask,
+            )
 
         return features, dataset
-        
+    elif return_dataset == "tf":
+        if not is_tf_available():
+            raise ImportError("TensorFlow must be installed to return a TensorFlow dataset.")
+
+        def gen():
+            for ex in features:
+                yield (
+                    {
+                        "input_ids": ex.input_ids,
+                        "attention_mask": ex.attention_mask,
+                        "token_type_ids": ex.token_type_ids,
+                    }, {
+                        "start_position": ex.start_position,
+                        "end_position": ex.end_position,
+                        "cls_index": ex.cls_index,
+                        "p_mask": ex.p_mask,
+                    }
+                )
+
+        return tf.data.Dataset.from_generator(
+            gen,
+            (
+                {"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32},
+                {"start_position": tf.int64, "end_position": tf.int64, "cls_index": tf.int64, "p_mask": tf.int32},
+            ),
+            (
+                {
+                    "input_ids": tf.TensorShape([None]),
+                    "attention_mask": tf.TensorShape([None]),
+                    "token_type_ids": tf.TensorShape([None]),
+                },
+                {
+                    "start_position": tf.TensorShape([]),
+                    "end_position": tf.TensorShape([]),
+                    "cls_index": tf.TensorShape([]),
+                    "p_mask": tf.TensorShape([None]),
+                },
+            ),
+        )
 
     return features
 
@@ -305,31 +359,32 @@ class SquadProcessor(DataProcessor):
     Processor for the SQuAD data set.
     Overriden by SquadV1Processor and SquadV2Processor, used by the version 1.1 and version 2.0 of SQuAD, respectively.
     """
+
     train_file = None
     dev_file = None
 
     def _get_example_from_tensor_dict(self, tensor_dict, evaluate=False):
         if not evaluate:
-            answer = tensor_dict['answers']['text'][0].numpy().decode('utf-8')
-            answer_start = tensor_dict['answers']['answer_start'][0].numpy()
+            answer = tensor_dict["answers"]["text"][0].numpy().decode("utf-8")
+            answer_start = tensor_dict["answers"]["answer_start"][0].numpy()
             answers = []
         else:
-            answers = [{
-                "answer_start": start.numpy(), 
-                "text": text.numpy().decode('utf-8')
-            } for start, text in zip(tensor_dict['answers']["answer_start"], tensor_dict['answers']["text"])]
+            answers = [
+                {"answer_start": start.numpy(), "text": text.numpy().decode("utf-8")}
+                for start, text in zip(tensor_dict["answers"]["answer_start"], tensor_dict["answers"]["text"])
+            ]
 
             answer = None
             answer_start = None
 
         return SquadExample(
-            qas_id=tensor_dict['id'].numpy().decode("utf-8"),
-            question_text=tensor_dict['question'].numpy().decode('utf-8'),
-            context_text=tensor_dict['context'].numpy().decode('utf-8'),
+            qas_id=tensor_dict["id"].numpy().decode("utf-8"),
+            question_text=tensor_dict["question"].numpy().decode("utf-8"),
+            context_text=tensor_dict["context"].numpy().decode("utf-8"),
             answer_text=answer,
             start_position_character=answer_start,
-            title=tensor_dict['title'].numpy().decode('utf-8'),
-            answers=answers
+            title=tensor_dict["title"].numpy().decode("utf-8"),
+            answers=answers,
         )
 
     def get_examples_from_dataset(self, dataset, evaluate=False):
@@ -359,7 +414,7 @@ class SquadProcessor(DataProcessor):
 
         examples = []
         for tensor_dict in tqdm(dataset):
-            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate)) 
+            examples.append(self._get_example_from_tensor_dict(tensor_dict, evaluate=evaluate))
 
         return examples
 
@@ -379,7 +434,9 @@ class SquadProcessor(DataProcessor):
         if self.train_file is None:
             raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
 
-        with open(os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding='utf-8') as reader:
+        with open(
+            os.path.join(data_dir, self.train_file if filename is None else filename), "r", encoding="utf-8"
+        ) as reader:
             input_data = json.load(reader)["data"]
         return self._create_examples(input_data, "train")
 
@@ -397,8 +454,10 @@ class SquadProcessor(DataProcessor):
 
         if self.dev_file is None:
             raise ValueError("SquadProcessor should be instantiated via SquadV1Processor or SquadV2Processor")
-        
-        with open(os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding='utf-8') as reader:
+
+        with open(
+            os.path.join(data_dir, self.dev_file if filename is None else filename), "r", encoding="utf-8"
+        ) as reader:
             input_data = json.load(reader)["data"]
         return self._create_examples(input_data, "dev")
 
@@ -406,7 +465,7 @@ class SquadProcessor(DataProcessor):
         is_training = set_type == "train"
         examples = []
         for entry in tqdm(input_data):
-            title = entry['title']
+            title = entry["title"]
             for paragraph in entry["paragraphs"]:
                 context_text = paragraph["context"]
                 for qa in paragraph["qas"]:
@@ -415,7 +474,7 @@ class SquadProcessor(DataProcessor):
                     start_position_character = None
                     answer_text = None
                     answers = []
-                    
+
                     if "is_impossible" in qa:
                         is_impossible = qa["is_impossible"]
                     else:
@@ -424,8 +483,8 @@ class SquadProcessor(DataProcessor):
                     if not is_impossible:
                         if is_training:
                             answer = qa["answers"][0]
-                            answer_text = answer['text']
-                            start_position_character = answer['answer_start']
+                            answer_text = answer["text"]
+                            start_position_character = answer["answer_start"]
                         else:
                             answers = qa["answers"]
 
@@ -437,12 +496,13 @@ class SquadProcessor(DataProcessor):
                         start_position_character=start_position_character,
                         title=title,
                         is_impossible=is_impossible,
-                        answers=answers
+                        answers=answers,
                     )
 
                     examples.append(example)
         return examples
 
+
 class SquadV1Processor(SquadProcessor):
     train_file = "train-v1.1.json"
     dev_file = "dev-v1.1.json"
@@ -451,7 +511,7 @@ class SquadV1Processor(SquadProcessor):
 class SquadV2Processor(SquadProcessor):
     train_file = "train-v2.0.json"
     dev_file = "dev-v2.0.json"
-    
+
 
 class SquadExample(object):
     """
@@ -468,21 +528,23 @@ class SquadExample(object):
         is_impossible: False by default, set to True if the example has no possible answer.
     """
 
-    def __init__(self,
-                 qas_id,
-                 question_text,
-                 context_text,
-                 answer_text,
-                 start_position_character,
-                 title,
-                 answers=[],
-                 is_impossible=False):
+    def __init__(
+        self,
+        qas_id,
+        question_text,
+        context_text,
+        answer_text,
+        start_position_character,
+        title,
+        answers=[],
+        is_impossible=False,
+    ):
         self.qas_id = qas_id
         self.question_text = question_text
         self.context_text = context_text
         self.answer_text = answer_text
         self.title = title
-        self.is_impossible = is_impossible 
+        self.is_impossible = is_impossible
         self.answers = answers
 
         self.start_position, self.end_position = 0, 0
@@ -537,24 +599,23 @@ class SquadFeatures(object):
         end_position: end of the answer token index 
     """
 
-    def __init__(self,
-                 input_ids,
-                 attention_mask,
-                 token_type_ids,
-                 cls_index,
-                 p_mask,
-                 
-                 example_index,
-                 unique_id,
-                 paragraph_len,
-                 token_is_max_context,
-                 tokens,
-                 token_to_orig_map,
-
-                 start_position,
-                 end_position
-        ):
-        self.input_ids = input_ids 
+    def __init__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        cls_index,
+        p_mask,
+        example_index,
+        unique_id,
+        paragraph_len,
+        token_is_max_context,
+        tokens,
+        token_to_orig_map,
+        start_position,
+        end_position,
+    ):
+        self.input_ids = input_ids
         self.attention_mask = attention_mask
         self.token_type_ids = token_type_ids
         self.cls_index = cls_index
@@ -580,12 +641,13 @@ class SquadResult(object):
         start_logits: The logits corresponding to the start of the answer
         end_logits: The logits corresponding to the end of the answer
     """
+
     def __init__(self, unique_id, start_logits, end_logits, start_top_index=None, end_top_index=None, cls_logits=None):
         self.start_logits = start_logits
         self.end_logits = end_logits
         self.unique_id = unique_id
-        
+
         if start_top_index:
             self.start_top_index = start_top_index
             self.end_top_index = end_top_index
-            self.cls_logits = cls_logits
\ No newline at end of file
+            self.cls_logits = cls_logits

From 866d73ca26a13d7e378b2f88f365cb0807c47805 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Fri, 13 Dec 2019 16:09:23 -0500
Subject: [PATCH 071/110] [cli] Upload is now compatible with folders

---
 transformers/commands/user.py | 57 ++++++++++++++++++++++++++---------
 1 file changed, 43 insertions(+), 14 deletions(-)

diff --git a/transformers/commands/user.py b/transformers/commands/user.py
index d79922ed8a..8e0e563422 100644
--- a/transformers/commands/user.py
+++ b/transformers/commands/user.py
@@ -19,8 +19,8 @@ class UserCommands(BaseTransformersCLICommand):
         list_parser.set_defaults(func=lambda args: ListObjsCommand(args))
         # upload
         upload_parser = parser.add_parser('upload')
-        upload_parser.add_argument('file', type=str, help='Local filepath of the file to upload.')
-        upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override object filename on S3.')
+        upload_parser.add_argument('path', type=str, help='Local path of the folder or individual file to upload.')
+        upload_parser.add_argument('--filename', type=str, default=None, help='Optional: override individual object filename on S3.')
         upload_parser.set_defaults(func=lambda args: UploadCommand(args))
 
 
@@ -138,28 +138,57 @@ class ListObjsCommand(BaseUserCommand):
 
 
 class UploadCommand(BaseUserCommand):
+    def walk_dir(self, rel_path):
+        """
+        Recursively list all files in a folder.
+        """
+        entries: List[os.DirEntry] = list(os.scandir(rel_path))
+        files = [
+            (
+                os.path.join(os.getcwd(), f.path),  # filepath
+                f.path  # filename
+            )
+            for f in entries if f.is_file()
+        ]
+        for f in entries:
+            if f.is_dir():
+                files += self.walk_dir(f.path)
+        return files
+
     def run(self):
         token = HfFolder.get_token()
         if token is None:
             print("Not logged in")
             exit(1)
-        filepath = os.path.join(os.getcwd(), self.args.file)
-        filename = self.args.filename if self.args.filename is not None else os.path.basename(filepath)
-        print(
-            "About to upload file {} to S3 under filename {}".format(
-                ANSI.bold(filepath), ANSI.bold(filename)
+        local_path = os.path.abspath(self.args.path)
+        if os.path.isdir(local_path):
+            if self.args.filename is not None:
+                raise ValueError("Cannot specify a filename override when uploading a folder.")
+            rel_path = os.path.basename(local_path)
+            files = self.walk_dir(rel_path)
+        elif os.path.isfile(local_path):
+            filename = self.args.filename if self.args.filename is not None else os.path.basename(local_path)
+            files = [(local_path, filename)]
+        else:
+            raise ValueError("Not a valid file or directory: {}".format(local_path))
+
+        for filepath, filename in files:
+            print(
+                "About to upload file {} to S3 under filename {}".format(
+                    ANSI.bold(filepath), ANSI.bold(filename)
+                )
             )
-        )
 
         choice = input("Proceed? [Y/n] ").lower()
         if not(choice == "" or choice == "y" or choice == "yes"):
             print("Abort")
             exit()
         print(
-            ANSI.bold("Uploading... This might take a while if file is large")
+            ANSI.bold("Uploading... This might take a while if files are large")
         )
-        access_url = self._api.presign_and_upload(
-            token=token, filename=filename, filepath=filepath
-        )
-        print("Your file now lives at:")
-        print(access_url)
+        for filepath, filename in files:
+            access_url = self._api.presign_and_upload(
+                token=token, filename=filename, filepath=filepath
+            )
+            print("Your file now lives at:")
+            print(access_url)

From 5b7b78e088352a3aaf1f80d26bb1cd466bc2ac64 Mon Sep 17 00:00:00 2001
From: Pascal Voitot <p.voitot@samsung.com>
Date: Sun, 8 Dec 2019 23:22:02 +0100
Subject: [PATCH 072/110] :bug: #2096 in tokenizer.decode, adds a space after
 special tokens to return right formatted string

---
 transformers/tokenization_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index e87c87787b..42519c26ba 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -1180,7 +1180,7 @@ class PreTrainedTokenizer(object):
                 if current_sub_text:
                     sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                     current_sub_text = []
-                sub_texts.append(" " + token)
+                sub_texts.append(" " + token + " ")
             else:
                 current_sub_text.append(token)
         if current_sub_text:

From df160af736cba1d50c09abcf92c8fc6c00bcdb13 Mon Sep 17 00:00:00 2001
From: Pascal Voitot <p.voitot@samsung.com>
Date: Tue, 10 Dec 2019 00:03:38 +0100
Subject: [PATCH 073/110] :bug: #2096 in tokenizer.decode, space is now joined
 between all subtexts instead of before added tokens

---
 transformers/tests/tokenization_bert_test.py | 16 ++++++++++++++++
 transformers/tokenization_utils.py           |  4 ++--
 2 files changed, 18 insertions(+), 2 deletions(-)

diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py
index f390248956..c47f149e9a 100644
--- a/transformers/tests/tokenization_bert_test.py
+++ b/transformers/tests/tokenization_bert_test.py
@@ -99,6 +99,21 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(
             tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
 
+    def test_encode_decode_with_spaces(self):
+        tokenizer = self.get_tokenizer()
+
+        new_toks = ['[ABC]', '[DEF]', 'GHI IHG']
+        tokenizer.add_tokens(new_toks)
+        input = "unwanted running [ABC] [DEF] running unwanted [ABC] GHI IHG unwanted [DEF]"
+        encoded = tokenizer.encode(input)
+        decoded = tokenizer.decode(encoded)
+        self.assertEqual(
+            decoded.lower(),
+            (f"[CLS] {input.lower()} [SEP]").lower()
+        )
+
+
+
     def test_is_whitespace(self):
         self.assertTrue(_is_whitespace(u" "))
         self.assertTrue(_is_whitespace(u"\t"))
@@ -139,5 +154,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         assert encoded_sentence == [101] + text + [102]
         assert encoded_pair == [101] + text + [102] + text_2 + [102]
 
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/transformers/tokenization_utils.py b/transformers/tokenization_utils.py
index 42519c26ba..8aef80fec8 100644
--- a/transformers/tokenization_utils.py
+++ b/transformers/tokenization_utils.py
@@ -1180,12 +1180,12 @@ class PreTrainedTokenizer(object):
                 if current_sub_text:
                     sub_texts.append(self.convert_tokens_to_string(current_sub_text))
                     current_sub_text = []
-                sub_texts.append(" " + token + " ")
+                sub_texts.append(token)
             else:
                 current_sub_text.append(token)
         if current_sub_text:
             sub_texts.append(self.convert_tokens_to_string(current_sub_text))
-        text = ''.join(sub_texts)
+        text = ' '.join(sub_texts)
 
         if clean_up_tokenization_spaces:
             clean_text = self.clean_up_tokenization(text)

From dd2add9f6efdaa248f3074b865dc67c439b30a4d Mon Sep 17 00:00:00 2001
From: Pascal Voitot <p.voitot@samsung.com>
Date: Tue, 10 Dec 2019 00:29:44 +0100
Subject: [PATCH 074/110] more tests

---
 transformers/tests/tokenization_bert_test.py |  2 +-
 transformers/tests/tokenization_gpt2_test.py | 14 ++++++++++++++
 2 files changed, 15 insertions(+), 1 deletion(-)

diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py
index c47f149e9a..b93934dd67 100644
--- a/transformers/tests/tokenization_bert_test.py
+++ b/transformers/tests/tokenization_bert_test.py
@@ -109,7 +109,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         decoded = tokenizer.decode(encoded)
         self.assertEqual(
             decoded.lower(),
-            (f"[CLS] {input.lower()} [SEP]").lower()
+            (f"[CLS] {input} [SEP]").lower()
         )
 
 
diff --git a/transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py
index a77cc75ec2..9e6ca3c4fd 100644
--- a/transformers/tests/tokenization_gpt2_test.py
+++ b/transformers/tests/tokenization_gpt2_test.py
@@ -67,6 +67,20 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
+    def test_encode_decode_with_spaces(self):
+        tokenizer = self.get_tokenizer()
+
+        new_toks = ['[ABC]', '[DEF]', 'GHI IHG']
+        tokenizer.add_tokens(new_toks)
+        input = "lower newer [ABC] [DEF] newer lower [ABC] GHI IHG newer lower[DEF]"
+        encoded = tokenizer.encode(input)
+        decoded = tokenizer.decode(encoded)
+        self.assertEqual(
+            decoded.lower(),
+            input.lower()
+        )
+
+
 
 if __name__ == '__main__':
     unittest.main()

From 4cbdc7d910a0a12871a8e29760a3a6721a138421 Mon Sep 17 00:00:00 2001
From: Pascal Voitot <p.voitot@samsung.com>
Date: Tue, 10 Dec 2019 09:37:15 +0100
Subject: [PATCH 075/110] missed space

---
 transformers/tests/tokenization_bert_test.py | 2 --
 transformers/tests/tokenization_gpt2_test.py | 2 +-
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py
index b93934dd67..a039a24dd8 100644
--- a/transformers/tests/tokenization_bert_test.py
+++ b/transformers/tests/tokenization_bert_test.py
@@ -112,8 +112,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
             (f"[CLS] {input} [SEP]").lower()
         )
 
-
-
     def test_is_whitespace(self):
         self.assertTrue(_is_whitespace(u" "))
         self.assertTrue(_is_whitespace(u"\t"))
diff --git a/transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py
index 9e6ca3c4fd..1b4fe42874 100644
--- a/transformers/tests/tokenization_gpt2_test.py
+++ b/transformers/tests/tokenization_gpt2_test.py
@@ -72,7 +72,7 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
 
         new_toks = ['[ABC]', '[DEF]', 'GHI IHG']
         tokenizer.add_tokens(new_toks)
-        input = "lower newer [ABC] [DEF] newer lower [ABC] GHI IHG newer lower[DEF]"
+        input = "lower newer [ABC] [DEF] newer lower [ABC] GHI IHG newer lower [DEF]"
         encoded = tokenizer.encode(input)
         decoded = tokenizer.decode(encoded)
         self.assertEqual(

From f2ac50cb5560e13d941f1ea3dec3399f12f7a3fb Mon Sep 17 00:00:00 2001
From: Pascal Voitot <p.voitot@samsung.com>
Date: Tue, 10 Dec 2019 09:58:06 +0100
Subject: [PATCH 076/110] better for python2.x

---
 transformers/tests/tokenization_bert_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py
index a039a24dd8..77b124cdf2 100644
--- a/transformers/tests/tokenization_bert_test.py
+++ b/transformers/tests/tokenization_bert_test.py
@@ -109,7 +109,7 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         decoded = tokenizer.decode(encoded)
         self.assertEqual(
             decoded.lower(),
-            (f"[CLS] {input} [SEP]").lower()
+            ("[CLS] " + input + " [SEP]").lower()
         )
 
     def test_is_whitespace(self):

From c3248cf122014dce10c0c8d0e663a95c948493e3 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 11 Dec 2019 12:36:37 -0500
Subject: [PATCH 077/110] Tests for all tokenizers

---
 transformers/tests/tokenization_bert_test.py     | 13 -------------
 transformers/tests/tokenization_gpt2_test.py     | 15 ---------------
 transformers/tests/tokenization_tests_commons.py |  9 +++++++++
 3 files changed, 9 insertions(+), 28 deletions(-)

diff --git a/transformers/tests/tokenization_bert_test.py b/transformers/tests/tokenization_bert_test.py
index 77b124cdf2..c503ea5e1e 100644
--- a/transformers/tests/tokenization_bert_test.py
+++ b/transformers/tests/tokenization_bert_test.py
@@ -99,19 +99,6 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(
             tokenizer.tokenize("unwantedX running"), ["[UNK]", "runn", "##ing"])
 
-    def test_encode_decode_with_spaces(self):
-        tokenizer = self.get_tokenizer()
-
-        new_toks = ['[ABC]', '[DEF]', 'GHI IHG']
-        tokenizer.add_tokens(new_toks)
-        input = "unwanted running [ABC] [DEF] running unwanted [ABC] GHI IHG unwanted [DEF]"
-        encoded = tokenizer.encode(input)
-        decoded = tokenizer.decode(encoded)
-        self.assertEqual(
-            decoded.lower(),
-            ("[CLS] " + input + " [SEP]").lower()
-        )
-
     def test_is_whitespace(self):
         self.assertTrue(_is_whitespace(u" "))
         self.assertTrue(_is_whitespace(u"\t"))
diff --git a/transformers/tests/tokenization_gpt2_test.py b/transformers/tests/tokenization_gpt2_test.py
index 1b4fe42874..5eae767bdf 100644
--- a/transformers/tests/tokenization_gpt2_test.py
+++ b/transformers/tests/tokenization_gpt2_test.py
@@ -67,20 +67,5 @@ class GPT2TokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
-    def test_encode_decode_with_spaces(self):
-        tokenizer = self.get_tokenizer()
-
-        new_toks = ['[ABC]', '[DEF]', 'GHI IHG']
-        tokenizer.add_tokens(new_toks)
-        input = "lower newer [ABC] [DEF] newer lower [ABC] GHI IHG newer lower [DEF]"
-        encoded = tokenizer.encode(input)
-        decoded = tokenizer.decode(encoded)
-        self.assertEqual(
-            decoded.lower(),
-            input.lower()
-        )
-
-
-
 if __name__ == '__main__':
     unittest.main()
diff --git a/transformers/tests/tokenization_tests_commons.py b/transformers/tests/tokenization_tests_commons.py
index c009958135..13e7ae746a 100644
--- a/transformers/tests/tokenization_tests_commons.py
+++ b/transformers/tests/tokenization_tests_commons.py
@@ -232,6 +232,15 @@ class CommonTestCases:
             self.assertNotEqual(len(tokens_2), 0)
             self.assertIsInstance(text_2, (str, unicode))
 
+        def test_encode_decode_with_spaces(self):
+            tokenizer = self.get_tokenizer()
+
+            new_toks = ['[ABC]', '[DEF]', 'GHI IHG']
+            tokenizer.add_tokens(new_toks)
+            input = "[ABC] [DEF] [ABC] GHI IHG [DEF]"
+            encoded = tokenizer.encode(input, add_special_tokens=False)
+            decoded = tokenizer.decode(encoded)
+            self.assertEqual(decoded, input)
 
         def test_pretrained_model_lists(self):
             weights_list = list(self.tokenizer_class.max_model_input_sizes.keys())

From 7bd11dda6f43656cf0a3891b7f61a67196d233b4 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Fri, 13 Dec 2019 16:45:30 -0500
Subject: [PATCH 078/110] Release: v2.2.2

---
 README.md                | 2 +-
 docs/source/conf.py      | 2 +-
 setup.py                 | 2 +-
 transformers/__init__.py | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index f3aa8a95ee..f24ceaa6d2 100644
--- a/README.md
+++ b/README.md
@@ -58,7 +58,7 @@ Choose the right framework for every part of a model's lifetime
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
 | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
 | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
-| [Documentation][(v2.2.0/v2.2.1)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
+| [Documentation][(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
 
 ## Installation
 
diff --git a/docs/source/conf.py b/docs/source/conf.py
index 2f8505ab3a..99b7b44922 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -26,7 +26,7 @@ author = u'huggingface'
 # The short X.Y version
 version = u''
 # The full version, including alpha/beta/rc tags
-release = u'2.2.1'
+release = u'2.2.2'
 
 
 # -- General configuration ---------------------------------------------------
diff --git a/setup.py b/setup.py
index c4af32df83..eacb5ecec0 100644
--- a/setup.py
+++ b/setup.py
@@ -44,7 +44,7 @@ extras['all'] = [package for package in extras.values()]
 
 setup(
     name="transformers",
-    version="2.2.1",
+    version="2.2.2",
     author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors, Facebook AI Authors, Carnegie Mellon University Authors",
     author_email="thomas@huggingface.co",
     description="State-of-the-art Natural Language Processing for TensorFlow 2.0 and PyTorch",
diff --git a/transformers/__init__.py b/transformers/__init__.py
index 5d7b0b772c..c11919f0a7 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "2.2.1"
+__version__ = "2.2.2"
 
 # Work around to update TensorFlow's absl.logging threshold which alters the
 # default Python logging output behavior when present.

From b6d4284b26c0ab5e736cb7838b27b720225feeb7 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Fri, 13 Dec 2019 22:43:15 -0500
Subject: [PATCH 079/110] [cli] Uploads: fix + test edge case

---
 transformers/hf_api.py                |  3 +-
 transformers/tests/fixtures/empty.txt |  0
 transformers/tests/hf_api_test.py     | 44 +++++++++++++++++++--------
 3 files changed, 33 insertions(+), 14 deletions(-)
 create mode 100644 transformers/tests/fixtures/empty.txt

diff --git a/transformers/hf_api.py b/transformers/hf_api.py
index 3bbb6c567a..170732339a 100644
--- a/transformers/hf_api.py
+++ b/transformers/hf_api.py
@@ -131,8 +131,9 @@ class HfApi:
         # the client still has to specify it when uploading the file.
         with open(filepath, "rb") as f:
             pf = TqdmProgressFileReader(f)
+            data = f if pf.total_size > 0 else ""
 
-            r = requests.put(urls.write, data=f, headers={
+            r = requests.put(urls.write, data=data, headers={
                 "content-type": urls.type,
             })
             r.raise_for_status()
diff --git a/transformers/tests/fixtures/empty.txt b/transformers/tests/fixtures/empty.txt
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/transformers/tests/hf_api_test.py b/transformers/tests/hf_api_test.py
index 92d41b6dff..b45f5aceed 100644
--- a/transformers/tests/hf_api_test.py
+++ b/transformers/tests/hf_api_test.py
@@ -15,18 +15,30 @@
 from __future__ import absolute_import, division, print_function
 
 import os
-import six
 import time
 import unittest
 
-from transformers.hf_api import HfApi, S3Obj, PresignedUrl, HfFolder, HTTPError
+import requests
+import six
+
+from transformers.hf_api import HfApi, HfFolder, HTTPError, PresignedUrl, S3Obj
 
 USER = "__DUMMY_TRANSFORMERS_USER__"
 PASS = "__DUMMY_TRANSFORMERS_PASS__"
-FILE_KEY = "Test-{}.txt".format(int(time.time()))
-FILE_PATH = os.path.join(
-    os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
-)
+FILES = [
+    (
+        "Test-{}.txt".format(int(time.time())),
+        os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "fixtures/input.txt"
+        )
+    ),
+    (
+        "yoyo {}.txt".format(int(time.time())), # space is intentional
+        os.path.join(
+            os.path.dirname(os.path.abspath(__file__)), "fixtures/empty.txt"
+        )
+    ),
+]
 
 
 
@@ -57,15 +69,21 @@ class HfApiEndpointsTest(HfApiCommonTest):
         self.assertEqual(user, USER)
 
     def test_presign(self):
-        urls = self._api.presign(token=self._token, filename=FILE_KEY)
-        self.assertIsInstance(urls, PresignedUrl)
-        self.assertEqual(urls.type, "text/plain")
+        for FILE_KEY, FILE_PATH in FILES:
+            urls = self._api.presign(token=self._token, filename=FILE_KEY)
+            self.assertIsInstance(urls, PresignedUrl)
+            self.assertEqual(urls.type, "text/plain")
 
     def test_presign_and_upload(self):
-        access_url = self._api.presign_and_upload(
-            token=self._token, filename=FILE_KEY, filepath=FILE_PATH
-        )
-        self.assertIsInstance(access_url, six.string_types)
+        for FILE_KEY, FILE_PATH in FILES:
+            access_url = self._api.presign_and_upload(
+                token=self._token, filename=FILE_KEY, filepath=FILE_PATH
+            )
+            self.assertIsInstance(access_url, six.string_types)
+            with open(FILE_PATH, 'r') as f:
+                body = f.read()
+            r = requests.get(access_url)
+            self.assertEqual(r.text, body)
 
     def test_list_objs(self):
         objs = self._api.list_objs(token=self._token)

From cbb368ca06998e5d98684bc622e1d8c68ba1d88f Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 14 Dec 2019 09:31:18 +0100
Subject: [PATCH 080/110] distilbert tests

---
 transformers/tests/modeling_common_test.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/transformers/tests/modeling_common_test.py b/transformers/tests/modeling_common_test.py
index cd4cf247a6..8920e8b826 100644
--- a/transformers/tests/modeling_common_test.py
+++ b/transformers/tests/modeling_common_test.py
@@ -96,9 +96,7 @@ class CommonTestCases:
 
                     # Make sure we don't have nans
                     out_1 = after_outputs[0].cpu().numpy()
-                    out_2 = outputs[0].cpu().numpy()
-                    out_1 = out_1[~np.isnan(out_1)]
-                    out_2 = out_2[~np.isnan(out_2)]
+                    out_1[np.isnan(out_1)] = 0
                     max_diff = np.amax(np.abs(out_1 - out_2))
                     self.assertLessEqual(max_diff, 1e-5)
 

From 7140363e092fecf82b73edd423bed3376ec1e150 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Sat, 14 Dec 2019 09:44:53 +0100
Subject: [PATCH 081/110] update bertabs

---
 .../summarization/configuration_bertabs.py    | 48 ++++++-------------
 1 file changed, 14 insertions(+), 34 deletions(-)

diff --git a/examples/summarization/configuration_bertabs.py b/examples/summarization/configuration_bertabs.py
index 054763ea93..b862d58d2b 100644
--- a/examples/summarization/configuration_bertabs.py
+++ b/examples/summarization/configuration_bertabs.py
@@ -33,6 +33,8 @@ class BertAbsConfig(PretrainedConfig):
     r""" Class to store the configuration of the BertAbs model.
 
     Arguments:
+        vocab_size: int
+            Number of tokens in the vocabulary.
         max_pos: int
             The maximum sequence length that this model will be used with.
         enc_layer: int
@@ -81,39 +83,17 @@ class BertAbsConfig(PretrainedConfig):
     ):
         super(BertAbsConfig, self).__init__(**kwargs)
 
-        if self._input_is_path_to_json(vocab_size):
-            path_to_json = vocab_size
-            with open(path_to_json, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif isinstance(vocab_size, int):
-            self.vocab_size = vocab_size
-            self.max_pos = max_pos
+        self.vocab_size = vocab_size
+        self.max_pos = max_pos
 
-            self.enc_layers = enc_layers
-            self.enc_hidden_size = enc_hidden_size
-            self.enc_heads = enc_heads
-            self.enc_ff_size = enc_ff_size
-            self.enc_dropout = enc_dropout
+        self.enc_layers = enc_layers
+        self.enc_hidden_size = enc_hidden_size
+        self.enc_heads = enc_heads
+        self.enc_ff_size = enc_ff_size
+        self.enc_dropout = enc_dropout
 
-            self.dec_layers = dec_layers
-            self.dec_hidden_size = dec_hidden_size
-            self.dec_heads = dec_heads
-            self.dec_ff_size = dec_ff_size
-            self.dec_dropout = dec_dropout
-        else:
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
-
-    def _input_is_path_to_json(self, first_argument):
-        """ Checks whether the first argument passed to config
-        is the path to a JSON file that contains the config.
-        """
-        is_python_2 = sys.version_info[0] == 2
-        if is_python_2:
-            return isinstance(first_argument, unicode)
-        else:
-            return isinstance(first_argument, str)
+        self.dec_layers = dec_layers
+        self.dec_hidden_size = dec_hidden_size
+        self.dec_heads = dec_heads
+        self.dec_ff_size = dec_ff_size
+        self.dec_dropout = dec_dropout

From 1b8613acb32a568db8d9b74ee182d43c4f8e9cbb Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 16 Dec 2019 09:51:42 +0100
Subject: [PATCH 082/110] updating t5 config class

---
 transformers/configuration_t5.py       | 15 ++-------------
 transformers/tests/modeling_t5_test.py |  2 +-
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/transformers/configuration_t5.py b/transformers/configuration_t5.py
index 6391cb4180..377a0919d9 100644
--- a/transformers/configuration_t5.py
+++ b/transformers/configuration_t5.py
@@ -66,7 +66,7 @@ class T5Config(PretrainedConfig):
     pretrained_config_archive_map = T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 
     def __init__(self,
-                 vocab_size_or_config_json_file=32128,
+                 vocab_size=32128,
                  n_positions=512,
                  d_model=512,
                  d_kv=64,
@@ -79,7 +79,7 @@ class T5Config(PretrainedConfig):
                  initializer_factor=1.0,
                  **kwargs):
         super(T5Config, self).__init__(**kwargs)
-        self.vocab_size = vocab_size_or_config_json_file if isinstance(vocab_size_or_config_json_file, int) else -1
+        self.vocab_size = vocab_size
         self.n_positions = n_positions
         self.d_model = d_model
         self.d_kv = d_kv
@@ -91,17 +91,6 @@ class T5Config(PretrainedConfig):
         self.layer_norm_epsilon = layer_norm_epsilon
         self.initializer_factor = initializer_factor
 
-        if isinstance(vocab_size_or_config_json_file, six.string_types):
-            with open(vocab_size_or_config_json_file, "r", encoding="utf-8") as reader:
-                json_config = json.loads(reader.read())
-            for key, value in json_config.items():
-                self.__dict__[key] = value
-        elif not isinstance(vocab_size_or_config_json_file, int):
-            raise ValueError(
-                "First argument must be either a vocabulary size (int)"
-                "or the path to a pretrained model config file (str)"
-            )
-
     @property
     def max_position_embeddings(self):
         return self.n_positions
diff --git a/transformers/tests/modeling_t5_test.py b/transformers/tests/modeling_t5_test.py
index a539cc868a..c337163375 100644
--- a/transformers/tests/modeling_t5_test.py
+++ b/transformers/tests/modeling_t5_test.py
@@ -93,7 +93,7 @@ class T5ModelTest(CommonTestCases.CommonModelTester):
                 decoder_lm_labels = ids_tensor([self.batch_size, self.decoder_seq_length], self.vocab_size)
 
             config = T5Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_positions=self.n_positions,
                 d_model=self.hidden_size,
                 d_ff=self.d_ff,

From 8669598abd7af877bd33890d62ae70ec1623f145 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 16 Dec 2019 09:59:36 +0100
Subject: [PATCH 083/110] update t5 tf

---
 transformers/tests/modeling_tf_t5_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/tests/modeling_tf_t5_test.py b/transformers/tests/modeling_tf_t5_test.py
index 99eec313f9..b905a9875b 100644
--- a/transformers/tests/modeling_tf_t5_test.py
+++ b/transformers/tests/modeling_tf_t5_test.py
@@ -87,7 +87,7 @@ class TFT5ModelTest(TFCommonTestCases.TFCommonModelTester):
                 token_labels = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
             config = T5Config(
-                vocab_size_or_config_json_file=self.vocab_size,
+                vocab_size=self.vocab_size,
                 n_positions=self.n_positions,
                 d_model=self.hidden_size,
                 d_ff=self.d_ff,

From 56e98ba81a9a7410243a1117fb6148d5f353ef98 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 16 Dec 2019 11:07:27 +0100
Subject: [PATCH 084/110] add model cards cc @mfuntowicz

---
 transformers/__init__.py              |   3 +
 transformers/file_utils.py            |   2 +-
 transformers/model_card.py            | 248 ++++++++++++++++++++++++++
 transformers/tests/model_card_test.py |  87 +++++++++
 4 files changed, 339 insertions(+), 1 deletion(-)
 create mode 100644 transformers/model_card.py
 create mode 100644 transformers/tests/model_card_test.py

diff --git a/transformers/__init__.py b/transformers/__init__.py
index 740d2440c2..15c167a5ce 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -33,6 +33,9 @@ from .data import (is_sklearn_available,
 if is_sklearn_available():
     from .data import glue_compute_metrics, xnli_compute_metrics
 
+# Model Cards
+from .model_card import ModelCard
+
 # Tokenizers
 from .tokenization_utils import (PreTrainedTokenizer)
 from .tokenization_auto import AutoTokenizer
diff --git a/transformers/file_utils.py b/transformers/file_utils.py
index 03b2fdb9f4..81c9b8002f 100644
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -72,7 +72,7 @@ WEIGHTS_NAME = "pytorch_model.bin"
 TF2_WEIGHTS_NAME = 'tf_model.h5'
 TF_WEIGHTS_NAME = 'model.ckpt'
 CONFIG_NAME = "config.json"
-
+MODEL_CARD_NAME = "model_card.json"
 
 DUMMY_INPUTS = [[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]]
 DUMMY_MASK = [[1, 1, 1, 1, 1], [1, 1, 1, 0, 0], [0, 0, 0, 1, 1]]
diff --git a/transformers/model_card.py b/transformers/model_card.py
new file mode 100644
index 0000000000..679c24872a
--- /dev/null
+++ b/transformers/model_card.py
@@ -0,0 +1,248 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" Model card base class and utilities."""
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import copy
+import json
+import logging
+import os
+import re
+from io import open
+
+from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, cached_path, is_remote_url, hf_bucket_url
+
+
+logger = logging.getLogger(__name__)
+
+
+ALL_MODELS_MAP = dict((key, value)
+    for pretrained_map in [
+        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ]
+    for key, value, in pretrained_map.items())
+
+
+class ModelCard(object):
+    r""" Model Card class.
+        Store model card as well as methods for loading/downloading/saving model cards.
+
+        Please read the following paper for details and explanation on the sections:
+            "Model Cards for Model Reporting"
+                by Margaret Mitchell, Simone Wu,
+                Andrew Zaldivar, Parker Barnes, Lucy Vasserman, Ben Hutchinson, Elena Spitzer,
+                Inioluwa Deborah Raji and Timnit Gebru for the proposal behind model cards.
+            Link: https://arxiv.org/abs/1810.03993
+
+        Note:
+            A model card can be loaded and saved to disk.
+
+        Parameters:
+    """
+    def __init__(self, **kwargs):
+        # Recommended attributes from https://arxiv.org/abs/1810.03993 (see papers)
+        self.model_details = kwargs.pop('model_details', {})
+        self.intended_use = kwargs.pop('intended_use', {})
+        self.factors = kwargs.pop('factors', {})
+        self.metrics = kwargs.pop('metrics', {})
+        self.evaluation_data = kwargs.pop('evaluation_data', {})
+        self.training_data = kwargs.pop('training_data', {})
+        self.quantitative_analyses = kwargs.pop('quantitative_analyses', {})
+        self.ethical_considerations = kwargs.pop('ethical_considerations', {})
+        self.caveats_and_recommendations = kwargs.pop('caveats_and_recommendations', {})
+
+        # Open additional attributes
+        for key, value in kwargs.items():
+            try:
+                setattr(self, key, value)
+            except AttributeError as err:
+                logger.error("Can't set {} with value {} for {}".format(key, value, self))
+                raise err
+
+    def save_pretrained(self, save_directory):
+        """ Save a model card object to the directory `save_directory`, so that it
+            can be re-loaded using the :func:`~transformers.ModelCard.from_pretrained` class method.
+        """
+        assert os.path.isdir(save_directory), "Saving path should be a directory where the model card can be saved"
+
+        # If we save using the predefined names, we can load using `from_pretrained`
+        output_model_card_file = os.path.join(save_directory, MODEL_CARD_NAME)
+
+        self.to_json_file(output_model_card_file)
+        logger.info("Model card saved in {}".format(output_model_card_file))
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r""" Instantiate a :class:`~transformers.ModelCard` from a pre-trained model's model card.
+
+        Parameters:
+            pretrained_model_name_or_path: either:
+
+                - a string with the `shortcut name` of a pre-trained model card to load from cache or download, e.g.: ``bert-base-uncased``.
+                - a string with the `identifier name` of a pre-trained model card that was user-uploaded to our S3, e.g.: ``dbmdz/bert-base-german-cased``.
+                - a path to a `directory` containing a model card file saved using the :func:`~transformers.ModelCard.save_pretrained` method, e.g.: ``./my_model_directory/``.
+                - a path or url to a saved model card JSON `file`, e.g.: ``./my_model_directory/model_card.json``.
+
+            cache_dir: (`optional`) string:
+                Path to a directory in which a downloaded pre-trained model
+                card should be cached if the standard cache should not be used.
+
+            kwargs: (`optional`) dict: key/value pairs with which to update the ModelCard object after loading.
+
+                - The values in kwargs of any keys which are model card attributes will be used to override the loaded values.
+                - Behavior concerning key/value pairs whose keys are *not* model card attributes is controlled by the `return_unused_kwargs` keyword parameter.
+
+            force_download: (`optional`) boolean, default False:
+                Force to (re-)download the model card file and override the cached version if it exists.
+
+            resume_download: (`optional`) boolean, default False:
+                Do not delete an incompletely received file. Attempt to resume the download if such a file exists.
+
+            proxies: (`optional`) dict, default None:
+                A dictionary of proxy servers to use by protocol or endpoint, e.g.: {'http': 'foo.bar:3128', 'http://hostname': 'foo.bar:4012'}.
+                The proxies are used on each request.
+
+            return_unused_kwargs: (`optional`) bool:
+
+                - If False, then this function returns just the final model card object.
+                - If True, then this function returns a tuple `(model card, unused_kwargs)` where `unused_kwargs` is a dictionary consisting of the key/value pairs whose keys are not model card attributes: i.e. the part of kwargs which has not been used to update `ModelCard` and is otherwise ignored.
+
+        Examples::
+
+            model_card = ModelCard.from_pretrained('bert-base-uncased')    # Download model card from S3 and cache.
+            model_card = ModelCard.from_pretrained('./test/saved_model/')  # E.g. model card was saved using `save_pretrained('./test/saved_model/')`
+            model_card = ModelCard.from_pretrained('./test/saved_model/model_card.json')
+            model_card = ModelCard.from_pretrained('bert-base-uncased', output_attention=True, foo=False)
+
+        """
+        cache_dir = kwargs.pop('cache_dir', None)
+        force_download = kwargs.pop('force_download', False)
+        resume_download = kwargs.pop('resume_download', False)
+        proxies = kwargs.pop('proxies', None)
+        return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
+
+        if pretrained_model_name_or_path in ALL_MODELS_MAP:
+            model_card_file = ALL_MODELS_MAP[pretrained_model_name_or_path]
+            model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)  # For simplicity we use the same pretrained url than config but with a different suffix
+        elif os.path.isdir(pretrained_model_name_or_path):
+            model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME)
+        elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
+            model_card_file = pretrained_model_name_or_path
+        else:
+            model_card_file = hf_bucket_url(pretrained_model_name_or_path, postfix=MODEL_CARD_NAME)
+        # redirect to the cache, if necessary
+        try:
+            resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=force_download,
+                                               proxies=proxies, resume_download=resume_download)
+
+            if resolved_model_card_file == model_card_file:
+                logger.info("loading model card file {}".format(model_card_file))
+            else:
+                logger.info("loading model card file {} from cache at {}".format(
+                    model_card_file, resolved_model_card_file))
+
+            # Load model card
+            model_card = cls.from_json_file(resolved_model_card_file)
+
+        except EnvironmentError:
+            if pretrained_model_name_or_path in ALL_MODELS_MAP:
+                logger.warning("Couldn't reach server at '{}' to download model card file.".format(
+                        model_card_file))
+            else:
+                logger.warning("Model name '{}' was not found in model name list ({}). " \
+                      "We assumed '{}' was a path or url to a model card file named {} or " \
+                      "a directory containing such a file but couldn't find any such file at this path or url.".format(
+                        pretrained_model_name_or_path,
+                        ', '.join(ALL_MODELS_MAP.keys()),
+                        model_card_file, MODEL_CARD_NAME))
+
+            logger.warning("Creating an empty model card.")
+
+            # We fall back on creating an empty model card
+            model_card = cls()
+
+        # Update model card with kwargs if needed
+        to_remove = []
+        for key, value in kwargs.items():
+            if hasattr(model_card, key):
+                setattr(model_card, key, value)
+                to_remove.append(key)
+        for key in to_remove:
+            kwargs.pop(key, None)
+
+        logger.info("Model card: %s", str(model_card))
+        if return_unused_kwargs:
+            return model_card, kwargs
+        else:
+            return model_card
+
+    @classmethod
+    def from_dict(cls, json_object):
+        """Constructs a `ModelCard` from a Python dictionary of parameters."""
+        return cls(**json_object)
+
+    @classmethod
+    def from_json_file(cls, json_file):
+        """Constructs a `ModelCard` from a json file of parameters."""
+        with open(json_file, "r", encoding='utf-8') as reader:
+            text = reader.read()
+        dict_obj = json.loads(text)
+        return cls(**dict_obj)
+
+    def __eq__(self, other):
+        return self.__dict__ == other.__dict__
+
+    def __repr__(self):
+        return str(self.to_json_string())
+
+    def to_dict(self):
+        """Serializes this instance to a Python dictionary."""
+        output = copy.deepcopy(self.__dict__)
+        return output
+
+    def to_json_string(self):
+        """Serializes this instance to a JSON string."""
+        return json.dumps(self.to_dict(), indent=2, sort_keys=True) + "\n"
+
+    def to_json_file(self, json_file_path):
+        """ Save this instance to a json file."""
+        with open(json_file_path, "w", encoding='utf-8') as writer:
+            writer.write(self.to_json_string())
diff --git a/transformers/tests/model_card_test.py b/transformers/tests/model_card_test.py
new file mode 100644
index 0000000000..4364cbacec
--- /dev/null
+++ b/transformers/tests/model_card_test.py
@@ -0,0 +1,87 @@
+# coding=utf-8
+# Copyright 2019 HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import absolute_import, division, print_function, unicode_literals
+
+import os
+import sys
+import json
+import tempfile
+import shutil
+import unittest
+
+from transformers.model_card import ModelCard
+from .tokenization_tests_commons import TemporaryDirectory
+
+class ModelCardTester(unittest.TestCase):
+
+    def setUp(self):
+        self.inputs_dict = {'model_details': {
+                                'Organization': 'testing',
+                                'Model date': 'today',
+                                'Model version': 'v2.1, Developed by Test Corp in 2019.',
+                                'Architecture': 'Convolutional Neural Network.',
+                                },
+                            'metrics': 'BLEU and ROUGE-1',
+                            'evaluation_data':{
+                                'Datasets':{
+                                    'BLEU': 'My-great-dataset-v1',
+                                    'ROUGE-1': 'My-short-dataset-v2.1',
+                                },
+                                'Preprocessing': 'See details on https://arxiv.org/pdf/1810.03993.pdf'
+                            },
+                            'training_data':{
+                                'Dataset': 'English Wikipedia dump dated 2018-12-01',
+                                'Preprocessing': 'Using SentencePiece vocabulary of size 52k tokens. See details on https://arxiv.org/pdf/1810.03993.pdf'
+                            },
+                            'quantitative_analyses': {
+                                'BLEU': 55.1,
+                                'ROUGE-1': 76,
+                            },
+                            }
+        self.tmpdirname = tempfile.mkdtemp()
+
+    def tearDown(self):
+        shutil.rmtree(self.tmpdirname)
+
+    def test_model_card_common_properties(self):
+        model_card = ModelCard.from_dict(self.inputs_dict)
+        self.assertTrue(hasattr(model_card, 'model_details'))
+        self.assertTrue(hasattr(model_card, 'intended_use'))
+        self.assertTrue(hasattr(model_card, 'factors'))
+        self.assertTrue(hasattr(model_card, 'metrics'))
+        self.assertTrue(hasattr(model_card, 'evaluation_data'))
+        self.assertTrue(hasattr(model_card, 'training_data'))
+        self.assertTrue(hasattr(model_card, 'quantitative_analyses'))
+        self.assertTrue(hasattr(model_card, 'ethical_considerations'))
+        self.assertTrue(hasattr(model_card, 'caveats_and_recommendations'))
+
+    def test_model_card_to_json_string(self):
+        model_card = ModelCard.from_dict(self.inputs_dict)
+        obj = json.loads(model_card.to_json_string())
+        for key, value in self.inputs_dict.items():
+            self.assertEqual(obj[key], value)
+
+    def test_model_card_to_json_file(self):
+        model_card_first = ModelCard.from_dict(self.inputs_dict)
+
+        with TemporaryDirectory() as tmpdirname:
+            filename = os.path.join(tmpdirname, u"model_card.json")
+            model_card_first.to_json_file(filename)
+            model_card_second = ModelCard.from_json_file(filename)
+
+        self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
+
+if __name__ == "__main__":
+    unittest.main()

From d3418a94ff4256725a690bd9c8167489b6f593b8 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 16 Dec 2019 13:52:41 +0100
Subject: [PATCH 085/110] update tests

---
 .../tests/configuration_common_test.py        | 27 ++++++++++++-------
 transformers/tests/model_card_test.py         | 16 ++++++-----
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/transformers/tests/configuration_common_test.py b/transformers/tests/configuration_common_test.py
index 8ee751153c..376d110d3c 100644
--- a/transformers/tests/configuration_common_test.py
+++ b/transformers/tests/configuration_common_test.py
@@ -16,15 +16,12 @@ from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import copy
 import os
-import shutil
 import json
-import random
-import uuid
+import tempfile
 
 import unittest
-import logging
+from .tokenization_tests_commons import TemporaryDirectory
 
 
 class ConfigTester(object):
@@ -48,16 +45,28 @@ class ConfigTester(object):
 
     def create_and_test_config_to_json_file(self):
         config_first = self.config_class(**self.inputs_dict)
-        json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
-        config_first.to_json_file(json_file_path)
-        config_second = self.config_class.from_json_file(json_file_path)
-        os.remove(json_file_path)
+
+        with TemporaryDirectory() as tmpdirname:
+            json_file_path = os.path.join(tmpdirname, "config.json")
+            config_first.to_json_file(json_file_path)
+            config_second = self.config_class.from_json_file(json_file_path)
+
+        self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
+
+    def create_and_test_config_from_and_save_pretrained(self):
+        config_first = self.config_class(**self.inputs_dict)
+
+        with TemporaryDirectory() as tmpdirname:
+            config_first.save_pretrained(tmpdirname)
+            config_second = self.config_class.from_pretrained(tmpdirname)
+
         self.parent.assertEqual(config_second.to_dict(), config_first.to_dict())
 
     def run_common_tests(self):
         self.create_and_test_config_common_properties()
         self.create_and_test_config_to_json_string()
         self.create_and_test_config_to_json_file()
+        self.create_and_test_config_from_and_save_pretrained()
 
 if __name__ == "__main__":
     unittest.main()
\ No newline at end of file
diff --git a/transformers/tests/model_card_test.py b/transformers/tests/model_card_test.py
index 4364cbacec..e75716f0aa 100644
--- a/transformers/tests/model_card_test.py
+++ b/transformers/tests/model_card_test.py
@@ -15,10 +15,7 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import os
-import sys
 import json
-import tempfile
-import shutil
 import unittest
 
 from transformers.model_card import ModelCard
@@ -50,10 +47,6 @@ class ModelCardTester(unittest.TestCase):
                                 'ROUGE-1': 76,
                             },
                             }
-        self.tmpdirname = tempfile.mkdtemp()
-
-    def tearDown(self):
-        shutil.rmtree(self.tmpdirname)
 
     def test_model_card_common_properties(self):
         model_card = ModelCard.from_dict(self.inputs_dict)
@@ -83,5 +76,14 @@ class ModelCardTester(unittest.TestCase):
 
         self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
 
+    def test_model_card_from_and_save_pretrained(self):
+        model_card_first = ModelCard.from_dict(self.inputs_dict)
+
+        with TemporaryDirectory() as tmpdirname:
+            model_card_first.save_pretrained(tmpdirname)
+            model_card_second = ModelCard.from_pretrained(tmpdirname)
+
+        self.assertEqual(model_card_second.to_dict(), model_card_first.to_dict())
+
 if __name__ == "__main__":
     unittest.main()

From a4d07b983a6c1716b4d39cf3fed570562aebf3f7 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 16 Dec 2019 14:00:32 +0100
Subject: [PATCH 086/110] dict of all config and model files cc @LysandreJik

---
 transformers/__init__.py           |  6 ++---
 transformers/configuration_auto.py | 42 ++++++++++++++++++++---------
 transformers/model_card.py         | 43 +++++-------------------------
 transformers/modeling_auto.py      | 42 ++++++++++++++++++++---------
 transformers/modeling_tf_auto.py   | 38 +++++++++++++++++++-------
 5 files changed, 98 insertions(+), 73 deletions(-)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index 15c167a5ce..0b343bed2b 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -55,7 +55,7 @@ from .tokenization_t5 import T5Tokenizer
 
 # Configurations
 from .configuration_utils import PretrainedConfig
-from .configuration_auto import AutoConfig
+from .configuration_auto import AutoConfig, ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
@@ -73,7 +73,7 @@ from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 if is_torch_available():
     from .modeling_utils import (PreTrainedModel, prune_layer, Conv1D)
     from .modeling_auto import (AutoModel, AutoModelForSequenceClassification, AutoModelForQuestionAnswering,
-                                AutoModelWithLMHead)
+                                AutoModelWithLMHead, ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
 
     from .modeling_bert import (BertPreTrainedModel, BertModel, BertForPreTraining,
                                 BertForMaskedLM, BertForNextSentencePrediction,
@@ -131,7 +131,7 @@ if is_torch_available():
 if is_tf_available():
     from .modeling_tf_utils import TFPreTrainedModel, TFSharedEmbeddings, TFSequenceSummary, shape_list
     from .modeling_tf_auto import (TFAutoModel, TFAutoModelForSequenceClassification, TFAutoModelForQuestionAnswering,
-                                   TFAutoModelWithLMHead)
+                                   TFAutoModelWithLMHead, TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP)
 
     from .modeling_tf_bert import (TFBertPreTrainedModel, TFBertMainLayer, TFBertEmbeddings,
                                    TFBertModel, TFBertForPreTraining,
diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py
index 680c55fa54..9fe58f173a 100644
--- a/transformers/configuration_auto.py
+++ b/transformers/configuration_auto.py
@@ -18,22 +18,40 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 
-from .configuration_bert import BertConfig
-from .configuration_openai import OpenAIGPTConfig
-from .configuration_gpt2 import GPT2Config
-from .configuration_transfo_xl import TransfoXLConfig
-from .configuration_xlnet import XLNetConfig
-from .configuration_xlm import XLMConfig
-from .configuration_roberta import RobertaConfig
-from .configuration_distilbert import DistilBertConfig
-from .configuration_ctrl import CTRLConfig
-from .configuration_camembert import CamembertConfig
-from .configuration_albert import AlbertConfig
-from .configuration_t5 import T5Config
+from .configuration_bert import BertConfig, BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_openai import OpenAIGPTConfig, OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_transfo_xl import TransfoXLConfig, TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_gpt2 import GPT2Config, GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_ctrl import CTRLConfig, CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlnet import XLNetConfig, XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm import XLMConfig, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_roberta import RobertaConfig, ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
 
 logger = logging.getLogger(__name__)
 
 
+ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value)
+    for pretrained_map in [
+        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        ]
+    for key, value, in pretrained_map.items())
+
+
 class AutoConfig(object):
     r""":class:`~transformers.AutoConfig` is a generic configuration class
         that will be instantiated as one of the configuration classes of the library
diff --git a/transformers/model_card.py b/transformers/model_card.py
index 679c24872a..6d56089844 100644
--- a/transformers/model_card.py
+++ b/transformers/model_card.py
@@ -21,21 +21,9 @@ import copy
 import json
 import logging
 import os
-import re
 from io import open
 
-from .configuration_bert import BERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_openai import OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_transfo_xl import TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_gpt2 import GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_ctrl import CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_xlnet import XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_xlm import XLM_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_distilbert import DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_albert import ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_camembert import CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
-from .configuration_t5 import T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_auto import ALL_PRETRAINED_CONFIG_ARCHIVE_MAP
 
 from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, cached_path, is_remote_url, hf_bucket_url
 
@@ -43,24 +31,6 @@ from .file_utils import CONFIG_NAME, MODEL_CARD_NAME, cached_path, is_remote_url
 logger = logging.getLogger(__name__)
 
 
-ALL_MODELS_MAP = dict((key, value)
-    for pretrained_map in [
-        BERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        OPENAI_GPT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        TRANSFO_XL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        GPT2_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        CTRL_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        XLNET_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        DISTILBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
-        ]
-    for key, value, in pretrained_map.items())
-
-
 class ModelCard(object):
     r""" Model Card class.
         Store model card as well as methods for loading/downloading/saving model cards.
@@ -159,9 +129,10 @@ class ModelCard(object):
         proxies = kwargs.pop('proxies', None)
         return_unused_kwargs = kwargs.pop('return_unused_kwargs', False)
 
-        if pretrained_model_name_or_path in ALL_MODELS_MAP:
-            model_card_file = ALL_MODELS_MAP[pretrained_model_name_or_path]
-            model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)  # For simplicity we use the same pretrained url than config but with a different suffix
+        if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
+            # For simplicity we use the same pretrained url as the configuration files but with a different suffix (model_card.json)
+            model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
+            model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)  # str.replace returns a new string, so rebind it
         elif os.path.isdir(pretrained_model_name_or_path):
             model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME)
         elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
@@ -183,7 +154,7 @@ class ModelCard(object):
             model_card = cls.from_json_file(resolved_model_card_file)
 
         except EnvironmentError:
-            if pretrained_model_name_or_path in ALL_MODELS_MAP:
+            if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
                 logger.warning("Couldn't reach server at '{}' to download model card file.".format(
                         model_card_file))
             else:
@@ -191,7 +162,7 @@ class ModelCard(object):
                       "We assumed '{}' was a path or url to a model card file named {} or " \
                       "a directory containing such a file but couldn't find any such file at this path or url.".format(
                         pretrained_model_name_or_path,
-                        ', '.join(ALL_MODELS_MAP.keys()),
+                        ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
                         model_card_file, MODEL_CARD_NAME))
 
             logger.warning("Creating an empty model card.")
diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py
index 19a54cca86..1a30ea4623 100644
--- a/transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -18,18 +18,18 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 
-from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering
-from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel
-from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel
-from .modeling_ctrl import CTRLModel, CTRLLMHeadModel
-from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel
-from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering
-from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering
-from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification
-from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification
-from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice
-from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering
-from .modeling_t5 import T5Model, T5WithLMHeadModel
+from .modeling_bert import BertModel, BertForMaskedLM, BertForSequenceClassification, BertForQuestionAnswering, BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_openai import OpenAIGPTModel, OpenAIGPTLMHeadModel, OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_gpt2 import GPT2Model, GPT2LMHeadModel, GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_ctrl import CTRLModel, CTRLLMHeadModel, CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_transfo_xl import TransfoXLModel, TransfoXLLMHeadModel, TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlnet import XLNetModel, XLNetLMHeadModel, XLNetForSequenceClassification, XLNetForQuestionAnswering, XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlm import XLMModel, XLMWithLMHeadModel, XLMForSequenceClassification, XLMForQuestionAnswering, XLM_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering, DistilBertForMaskedLM, DistilBertForSequenceClassification, DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_utils import PreTrainedModel, SequenceSummary
 
@@ -38,6 +38,24 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 
+ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
+    for pretrained_map in [
+        BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        T5_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ]
+    for key, value, in pretrained_map.items())
+
+
 class AutoModel(object):
     r"""
         :class:`~transformers.AutoModel` is a generic model class
diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py
index b4ff660098..9c687d9235 100644
--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -18,22 +18,40 @@ from __future__ import absolute_import, division, print_function, unicode_litera
 
 import logging
 
-from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering
-from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel
-from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel
-from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel
-from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple
-from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple
-from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification
-from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification
-from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel
-from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel
+from .modeling_tf_bert import TFBertModel, TFBertForMaskedLM, TFBertForSequenceClassification, TFBertForQuestionAnswering, TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_openai import TFOpenAIGPTModel, TFOpenAIGPTLMHeadModel, TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_gpt2 import TFGPT2Model, TFGPT2LMHeadModel, TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_transfo_xl import TFTransfoXLModel, TFTransfoXLLMHeadModel, TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_xlnet import TFXLNetModel, TFXLNetLMHeadModel, TFXLNetForSequenceClassification, TFXLNetForQuestionAnsweringSimple, TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceClassification, TFXLMForQuestionAnsweringSimple, TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .file_utils import add_start_docstrings
 
 logger = logging.getLogger(__name__)
 
 
+TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
+    for pretrained_map in [
+        TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_OPENAI_GPT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_GPT2_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_XLNET_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_XLM_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
+        TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
+        ]
+    for key, value, in pretrained_map.items())
+
+
 class TFAutoModel(object):
     r"""
         :class:`~transformers.TFAutoModel` is a generic model class

From db0a9ee6e0ddcb9d634c3ab0ba3d25501c370d8c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 16 Dec 2019 14:08:08 +0100
Subject: [PATCH 087/110] adding albert to TF auto models cc @LysandreJik

---
 transformers/modeling_tf_auto.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/transformers/modeling_tf_auto.py b/transformers/modeling_tf_auto.py
index 9c687d9235..3e9b4d120b 100644
--- a/transformers/modeling_tf_auto.py
+++ b/transformers/modeling_tf_auto.py
@@ -27,6 +27,7 @@ from .modeling_tf_xlm import TFXLMModel, TFXLMWithLMHeadModel, TFXLMForSequenceC
 from .modeling_tf_roberta import TFRobertaModel, TFRobertaForMaskedLM, TFRobertaForSequenceClassification, TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_tf_distilbert import TFDistilBertModel, TFDistilBertForQuestionAnswering, TFDistilBertForMaskedLM, TFDistilBertForSequenceClassification, TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_tf_ctrl import TFCTRLModel, TFCTRLLMHeadModel, TF_CTRL_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_tf_albert import TFAlbertModel, TFAlbertForMaskedLM, TFAlbertForSequenceClassification, TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_tf_t5 import TFT5Model, TFT5WithLMHeadModel, TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .file_utils import add_start_docstrings
@@ -46,7 +47,6 @@ TF_ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
         TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
         TF_DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
-        TF_CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         TF_T5_PRETRAINED_MODEL_ARCHIVE_MAP,
         ]
     for key, value, in pretrained_map.items())
@@ -162,6 +162,8 @@ class TFAutoModel(object):
             return TFT5Model.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'distilbert' in pretrained_model_name_or_path:
             return TFDistilBertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return TFAlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return TFRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -298,6 +300,8 @@ class TFAutoModelWithLMHead(object):
             return TFT5WithLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'distilbert' in pretrained_model_name_or_path:
             return TFDistilBertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return TFAlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return TFRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -425,6 +429,8 @@ class TFAutoModelForSequenceClassification(object):
         """
         if 'distilbert' in pretrained_model_name_or_path:
             return TFDistilBertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'albert' in pretrained_model_name_or_path:
+            return TFAlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return TFRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:

From 031ad4eb3780437d5232392b16891078b1b32d2c Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 16 Dec 2019 14:20:57 +0100
Subject: [PATCH 088/110] improving JSON error messages (for model card and
 configurations)

---
 transformers/configuration_utils.py | 15 +++++++++++----
 transformers/model_card.py          | 12 ++++++++----
 2 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/transformers/configuration_utils.py b/transformers/configuration_utils.py
index 6c9eeea175..f692c9b132 100644
--- a/transformers/configuration_utils.py
+++ b/transformers/configuration_utils.py
@@ -151,10 +151,14 @@ class PretrainedConfig(object):
             config_file = pretrained_model_name_or_path
         else:
             config_file = hf_bucket_url(pretrained_model_name_or_path, postfix=CONFIG_NAME)
-        # redirect to the cache, if necessary
+
         try:
+            # Load from URL or cache if already cached
             resolved_config_file = cached_path(config_file, cache_dir=cache_dir, force_download=force_download,
                                                proxies=proxies, resume_download=resume_download)
+            # Load config
+            config = cls.from_json_file(resolved_config_file)
+
         except EnvironmentError:
             if pretrained_model_name_or_path in cls.pretrained_config_archive_map:
                 msg = "Couldn't reach server at '{}' to download pretrained model configuration file.".format(
@@ -168,15 +172,18 @@ class PretrainedConfig(object):
                         config_file, CONFIG_NAME)
             raise EnvironmentError(msg)
 
+        except json.JSONDecodeError:
+            msg = "Couldn't reach server at '{}' to download configuration file or " \
+                  "configuration file is not a valid JSON file. " \
+                  "Please check network or file content here: {}.".format(config_file, resolved_config_file)
+            raise EnvironmentError(msg)
+
         if resolved_config_file == config_file:
             logger.info("loading configuration file {}".format(config_file))
         else:
             logger.info("loading configuration file {} from cache at {}".format(
                 config_file, resolved_config_file))
 
-        # Load config
-        config = cls.from_json_file(resolved_config_file)
-
         if hasattr(config, 'pruned_heads'):
             config.pruned_heads = dict((int(key), value) for key, value in config.pruned_heads.items())
 
diff --git a/transformers/model_card.py b/transformers/model_card.py
index 6d56089844..3c775ab7fc 100644
--- a/transformers/model_card.py
+++ b/transformers/model_card.py
@@ -132,7 +132,7 @@ class ModelCard(object):
         if pretrained_model_name_or_path in ALL_PRETRAINED_CONFIG_ARCHIVE_MAP:
             # For simplicity we use the same pretrained url than the configuration files but with a different suffix (model_card.json)
             model_card_file = ALL_PRETRAINED_CONFIG_ARCHIVE_MAP[pretrained_model_name_or_path]
-            model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)
+            model_card_file = model_card_file.replace(CONFIG_NAME, MODEL_CARD_NAME)
         elif os.path.isdir(pretrained_model_name_or_path):
             model_card_file = os.path.join(pretrained_model_name_or_path, MODEL_CARD_NAME)
         elif os.path.isfile(pretrained_model_name_or_path) or is_remote_url(pretrained_model_name_or_path):
@@ -143,13 +143,11 @@ class ModelCard(object):
         try:
             resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=force_download,
                                                proxies=proxies, resume_download=resume_download)
-
             if resolved_model_card_file == model_card_file:
                 logger.info("loading model card file {}".format(model_card_file))
             else:
                 logger.info("loading model card file {} from cache at {}".format(
                     model_card_file, resolved_model_card_file))
-
             # Load model card
             model_card = cls.from_json_file(resolved_model_card_file)
 
@@ -164,9 +162,15 @@ class ModelCard(object):
                         pretrained_model_name_or_path,
                         ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
                         model_card_file, MODEL_CARD_NAME))
-
             logger.warning("Creating an empty model card.")
+            # We fall back on creating an empty model card
+            model_card = cls()
 
+        except json.JSONDecodeError:
+            logger.warning("Couldn't reach server at '{}' to download model card file or "
+                           "model card file is not a valid JSON file. "
+                           "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file))
+            logger.warning("Creating an empty model card.")
             # We fall back on creating an empty model card
             model_card = cls()
 

From 1bbdbacd5bc7281dbcebfe4330a464a7ad1a6e72 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 16 Dec 2019 14:38:20 +0100
Subject: [PATCH 089/110] update __init__ and saving

---
 transformers/__init__.py   |  2 +-
 transformers/model_card.py | 19 +++++++++++--------
 2 files changed, 12 insertions(+), 9 deletions(-)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index 0b343bed2b..44447c5495 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -19,7 +19,7 @@ logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 # Files and general utilities
 from .file_utils import (TRANSFORMERS_CACHE, PYTORCH_TRANSFORMERS_CACHE, PYTORCH_PRETRAINED_BERT_CACHE,
                          cached_path, add_start_docstrings, add_end_docstrings,
-                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME,
+                         WEIGHTS_NAME, TF2_WEIGHTS_NAME, TF_WEIGHTS_NAME, CONFIG_NAME, MODEL_CARD_NAME,
                          is_tf_available, is_torch_available)
 
 from .data import (is_sklearn_available,
diff --git a/transformers/model_card.py b/transformers/model_card.py
index 3c775ab7fc..baec7e8622 100644
--- a/transformers/model_card.py
+++ b/transformers/model_card.py
@@ -67,14 +67,14 @@ class ModelCard(object):
                 logger.error("Can't set {} with value {} for {}".format(key, value, self))
                 raise err
 
-    def save_pretrained(self, save_directory):
-        """ Save a model card object to the directory `save_directory`, so that it
-            can be re-loaded using the :func:`~transformers.ModelCard.from_pretrained` class method.
+    def save_pretrained(self, save_directory_or_file):
+        """ Save a model card object to the directory or file `save_directory_or_file`.
         """
-        assert os.path.isdir(save_directory), "Saving path should be a directory where the model card can be saved"
-
-        # If we save using the predefined names, we can load using `from_pretrained`
-        output_model_card_file = os.path.join(save_directory, MODEL_CARD_NAME)
+        if os.path.isdir(save_directory_or_file):
+            # If we save using the predefined names, we can load using `from_pretrained`
+            output_model_card_file = os.path.join(save_directory_or_file, MODEL_CARD_NAME)
+        else:
+            output_model_card_file = save_directory_or_file
 
         self.to_json_file(output_model_card_file)
         logger.info("Model card saved in {}".format(output_model_card_file))
@@ -139,8 +139,9 @@ class ModelCard(object):
             model_card_file = pretrained_model_name_or_path
         else:
             model_card_file = hf_bucket_url(pretrained_model_name_or_path, postfix=MODEL_CARD_NAME)
-        # redirect to the cache, if necessary
+
         try:
+            # Load from URL or cache if already cached
             resolved_model_card_file = cached_path(model_card_file, cache_dir=cache_dir, force_download=force_download,
                                                proxies=proxies, resume_download=resume_download)
             if resolved_model_card_file == model_card_file:
@@ -163,6 +164,7 @@ class ModelCard(object):
                         ', '.join(ALL_PRETRAINED_CONFIG_ARCHIVE_MAP.keys()),
                         model_card_file, MODEL_CARD_NAME))
             logger.warning("Creating an empty model card.")
+
             # We fall back on creating an empty model card
             model_card = cls()
 
@@ -171,6 +173,7 @@ class ModelCard(object):
                            "model card file is not a valid JSON file. "
                            "Please check network or file content here: {}.".format(model_card_file, resolved_model_card_file))
             logger.warning("Creating an empty model card.")
+
             # We fall back on creating an empty model card
             model_card = cls()
 

From 855ff0e91d8b3bd75a3b1c1316e2efd814373764 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 16 Dec 2019 12:42:22 -0500
Subject: [PATCH 090/110] [doc] Model upload and sharing

ping @lysandrejik @thomwolf

Is this clear enough? Anything we should add?
---
 README.md                    | 41 ++++++++++++++++++++++++++++++++++++
 docs/source/index.rst        |  1 +
 docs/source/model_sharing.md | 40 +++++++++++++++++++++++++++++++++++
 3 files changed, 82 insertions(+)
 create mode 100644 docs/source/model_sharing.md

diff --git a/README.md b/README.md
index 214f61cc0c..a5ae74a9ae 100644
--- a/README.md
+++ b/README.md
@@ -56,6 +56,7 @@ Choose the right framework for every part of a model's lifetime
 | [Quick tour: Usage](#quick-tour) | Tokenizers & models usage: Bert and GPT-2 |
 | [Quick tour: TF 2.0 and PyTorch ](#Quick-tour-TF-20-training-and-PyTorch-interoperability) | Train a TF 2.0 model in 10 lines of code, load it in PyTorch |
 | [Quick tour: Fine-tuning/usage scripts](#quick-tour-of-the-fine-tuningusage-scripts) | Using provided scripts: GLUE, SQuAD and Text generation |
+| [Quick tour: Share your models ](#Quick-tour-of-model-sharing) | Upload and share your fine-tuned models with the community |
 | [Migrating from pytorch-transformers to transformers](#Migrating-from-pytorch-transformers-to-transformers) | Migrating your code from pytorch-transformers to transformers |
 | [Migrating from pytorch-pretrained-bert to pytorch-transformers](#Migrating-from-pytorch-pretrained-bert-to-transformers) | Migrating your code from pytorch-pretrained-bert to transformers |
 | [Documentation][(v2.2.0/v2.2.1/v2.2.2)](https://huggingface.co/transformers/v2.2.0) [(v2.1.1)](https://huggingface.co/transformers/v2.1.1) [(v2.0.0)](https://huggingface.co/transformers/v2.0.0) [(v1.2.0)](https://huggingface.co/transformers/v1.2.0) [(v1.1.0)](https://huggingface.co/transformers/v1.1.0) [(v1.0.0)](https://huggingface.co/transformers/v1.0.0) [(master)](https://huggingface.co/transformers) | Full API documentation and more |
@@ -446,6 +447,46 @@ python ./examples/run_generation.py \
     --repetition_penalty=1.2 \
 ```
 
+## Quick tour of model sharing
+
+New in `v2.2.2`: you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built into the library.
+
+**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
+
+```shell
+transformers-cli login
+# log in using the same credentials as on huggingface.co
+```
+Upload your model:
+```shell
+transformers-cli upload ./path/to/pretrained_model/
+
+# ^^ Upload folder containing weights/tokenizer/config
+# saved via `.save_pretrained()`
+
+transformers-cli upload ./config.json [--filename foobar.json]
+
+# ^^ Upload a single file
+# (you can optionally override its filename)
+```
+
+Your model will then be accessible through its identifier:
+```python
+"username/model_name"
+```
+
+Anyone can load it from code:
+```python
+tokenizer = AutoTokenizer.from_pretrained("username/model_name")
+model = AutoModel.from_pretrained("username/model_name")
+```
+
+Finally, list all your files on S3:
+```shell
+transformers-cli ls
+# List all your S3 objects.
+```
+
 ## Migrating from pytorch-transformers to transformers
 
 Here is a quick summary of what you should take care of when migrating from `pytorch-transformers` to `transformers`.
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 84012fc6cf..48282c1c6c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -58,6 +58,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
     installation
     quickstart
     pretrained_models
+    model_sharing
     examples
     notebooks
     serialization
diff --git a/docs/source/model_sharing.md b/docs/source/model_sharing.md
new file mode 100644
index 0000000000..b9c722b10f
--- /dev/null
+++ b/docs/source/model_sharing.md
@@ -0,0 +1,40 @@
+# Model upload and sharing
+
+Starting with `v2.2.2`, you can now upload and share your fine-tuned models with the community, using the <abbr title="Command-line interface">CLI</abbr> that's built into the library.
+
+**First, create an account on [https://huggingface.co/join](https://huggingface.co/join)**. Then:
+
+```shell
+transformers-cli login
+# log in using the same credentials as on huggingface.co
+```
+Upload your model:
+```shell
+transformers-cli upload ./path/to/pretrained_model/
+
+# ^^ Upload folder containing weights/tokenizer/config
+# saved via `.save_pretrained()`
+
+transformers-cli upload ./config.json [--filename foobar.json]
+
+# ^^ Upload a single file
+# (you can optionally override its filename)
+```
+
+Your model will then be accessible through its identifier:
+```python
+"username/model_name"
+```
+
+Anyone can load it from code:
+```python
+tokenizer = AutoTokenizer.from_pretrained("username/model_name")
+model = AutoModel.from_pretrained("username/model_name")
+```
+
+Finally, list all your files on S3:
+```shell
+transformers-cli ls
+# List all your S3 objects.
+```
+

From d8034092153a6850052862f154a398b88b8ba4e5 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Mon, 16 Dec 2019 16:31:38 -0500
Subject: [PATCH 091/110] Fix run squad evaluate during training

---
 examples/run_squad.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index a39915ee8b..34c31c3bb8 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -223,7 +223,7 @@ def evaluate(args, model, tokenizer, prefix=""):
     eval_dataloader = DataLoader(dataset, sampler=eval_sampler, batch_size=args.eval_batch_size)
 
     # multi-gpu evaluate
-    if args.n_gpu > 1:
+    if args.n_gpu > 1 and not isinstance(model, torch.nn.DataParallel):
         model = torch.nn.DataParallel(model)
 
     # Eval!

From 18a879f47576822aa1a5c49aecb27d89bfa5fa69 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Mon, 16 Dec 2019 16:44:29 -0500
Subject: [PATCH 092/110] fix #2180

---
 examples/run_generation.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/examples/run_generation.py b/examples/run_generation.py
index 2d917660cf..fa52905b7e 100644
--- a/examples/run_generation.py
+++ b/examples/run_generation.py
@@ -247,7 +247,11 @@ def main():
         out = out[:, len(context_tokens):].tolist()
         for o in out:
             text = tokenizer.decode(o, clean_up_tokenization_spaces=True)
-            text = text[: text.find(args.stop_token) if args.stop_token else None]
+            if args.stop_token:
+                index =  text.find(args.stop_token)
+                if index == -1:
+                    index = None
+                text = text[:index]
 
             print(text)
 

From 3cb51299c371f67b4da40b89c59c63e9405591f0 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Mon, 16 Dec 2019 22:32:05 +0100
Subject: [PATCH 093/110] Fix #2109

---
 transformers/modeling_tf_pytorch_utils.py | 13 +++++++++++--
 transformers/modeling_tf_utils.py         |  2 +-
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_tf_pytorch_utils.py b/transformers/modeling_tf_pytorch_utils.py
index 9d2b663dcb..d885fd23b3 100644
--- a/transformers/modeling_tf_pytorch_utils.py
+++ b/transformers/modeling_tf_pytorch_utils.py
@@ -143,7 +143,11 @@ def load_pytorch_weights_in_tf2_model(tf_model, pt_state_dict, tf_inputs=None, a
         name, transpose = convert_tf_weight_name_to_pt_weight_name(sw_name, start_prefix_to_remove=start_prefix_to_remove)
 
         # Find associated numpy array in pytorch model state dict
-        assert name in pt_state_dict, "{} not found in PyTorch model".format(name)
+        if name not in pt_state_dict:
+            if allow_missing_keys:
+                continue
+            raise AttributeError("{} not found in PyTorch model".format(name))
+
         array = pt_state_dict[name].numpy()
 
         if transpose:
@@ -250,6 +254,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
 
     all_tf_weights = set(list(tf_weights_map.keys()))
     loaded_pt_weights_data_ptr = {}
+    missing_keys_pt = []
     for pt_weight_name, pt_weight in current_pt_params_dict.items():
         # Handle PyTorch shared weight ()not duplicated in TF 2.0
         if pt_weight.data_ptr() in loaded_pt_weights_data_ptr:
@@ -258,7 +263,10 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
 
         # Find associated numpy array in pytorch model state dict
         if pt_weight_name not in tf_weights_map:
-            raise ValueError("{} not found in TF 2.0 model".format(pt_weight_name))
+            if allow_missing_keys:
+                missing_keys_pt.append(pt_weight_name)
+                continue
+            raise AttributeError("{} not found in TF 2.0 model".format(pt_weight_name))
 
         array, transpose = tf_weights_map[pt_weight_name]
 
@@ -283,6 +291,7 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
         all_tf_weights.discard(pt_weight_name)
 
     missing_keys, unexpected_keys = pt_model.load_state_dict(new_pt_params_dict, strict=False)
+    missing_keys += missing_keys_pt
 
     if len(missing_keys) > 0:
         logger.info("Weights of {} not initialized from TF 2.0 model: {}".format(
diff --git a/transformers/modeling_tf_utils.py b/transformers/modeling_tf_utils.py
index 6fb4850b05..6bbec71cdf 100644
--- a/transformers/modeling_tf_utils.py
+++ b/transformers/modeling_tf_utils.py
@@ -297,7 +297,7 @@ class TFPreTrainedModel(tf.keras.Model):
 
         if from_pt:
             # Load from a PyTorch checkpoint
-            return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file)
+            return load_pytorch_checkpoint_in_tf2_model(model, resolved_archive_file, allow_missing_keys=True)
 
         ret = model(model.dummy_inputs, training=False)  # build the network with dummy inputs
 

From 3f5ccb183e3cfa755dea2dd2afd9abbf1a0f93b8 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 16 Dec 2019 18:20:23 -0500
Subject: [PATCH 094/110] [doc] Clarify uploads

cf https://github.com/huggingface/transformers/commit/855ff0e91d8b3bd75a3b1c1316e2efd814373764#commitcomment-36452545
---
 README.md                    | 10 +++++-----
 docs/source/model_sharing.md | 12 ++++++------
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/README.md b/README.md
index a5ae74a9ae..c33a65bdbb 100644
--- a/README.md
+++ b/README.md
@@ -464,21 +464,21 @@ transformers-cli upload ./path/to/pretrained_model/
 # ^^ Upload folder containing weights/tokenizer/config
 # saved via `.save_pretrained()`
 
-transformers-cli upload ./config.json [--filename foobar.json]
+transformers-cli upload ./config.json [--filename folder/foobar.json]
 
 # ^^ Upload a single file
-# (you can optionally override its filename)
+# (you can optionally override its filename, which can be nested inside a folder)
 ```
 
-Your model will then be accessible through its identifier:
+Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
 ```python
 "username/model_name"
 ```
 
 Anyone can load it from code:
 ```python
-tokenizer = AutoTokenizer.from_pretrained("username/model_name")
-model = AutoModel.from_pretrained("username/model_name")
+tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
+model = AutoModel.from_pretrained("username/pretrained_model")
 ```
 
 Finally, list all your files on S3:
diff --git a/docs/source/model_sharing.md b/docs/source/model_sharing.md
index b9c722b10f..95baafb575 100644
--- a/docs/source/model_sharing.md
+++ b/docs/source/model_sharing.md
@@ -15,21 +15,21 @@ transformers-cli upload ./path/to/pretrained_model/
 # ^^ Upload folder containing weights/tokenizer/config
 # saved via `.save_pretrained()`
 
-transformers-cli upload ./config.json [--filename foobar.json]
+transformers-cli upload ./config.json [--filename folder/foobar.json]
 
 # ^^ Upload a single file
-# (you can optionally override its filename)
+# (you can optionally override its filename, which can be nested inside a folder)
 ```
 
-Your model will then be accessible through its identifier:
+Your model will then be accessible through its identifier, a concatenation of your username and the folder name above:
 ```python
-"username/model_name"
+"username/pretrained_model"
 ```
 
 Anyone can load it from code:
 ```python
-tokenizer = AutoTokenizer.from_pretrained("username/model_name")
-model = AutoModel.from_pretrained("username/model_name")
+tokenizer = AutoTokenizer.from_pretrained("username/pretrained_model")
+model = AutoModel.from_pretrained("username/pretrained_model")
 ```
 
 Finally, list all your files on S3:

From 2cff4bd8f3ad412917f4f295b97b952e297fa257 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 17 Dec 2019 14:01:04 -0500
Subject: [PATCH 095/110] Fix segmentation fault

---
 transformers/file_utils.py | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/transformers/file_utils.py b/transformers/file_utils.py
index 81c9b8002f..16010f7e0a 100644
--- a/transformers/file_utils.py
+++ b/transformers/file_utils.py
@@ -26,14 +26,6 @@ from contextlib import contextmanager
 
 logger = logging.getLogger(__name__)  # pylint: disable=invalid-name
 
-try:
-    import tensorflow as tf
-    assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2
-    _tf_available = True  # pylint: disable=invalid-name
-    logger.info("TensorFlow version {} available.".format(tf.__version__))
-except (ImportError, AssertionError):
-    _tf_available = False  # pylint: disable=invalid-name
-
 try:
     import torch
     _torch_available = True  # pylint: disable=invalid-name
@@ -41,6 +33,13 @@ try:
 except ImportError:
     _torch_available = False  # pylint: disable=invalid-name
 
+try:
+    import tensorflow as tf
+    assert hasattr(tf, '__version__') and int(tf.__version__[0]) >= 2
+    _tf_available = True  # pylint: disable=invalid-name
+    logger.info("TensorFlow version {} available.".format(tf.__version__))
+except (ImportError, AssertionError):
+    _tf_available = False  # pylint: disable=invalid-name
 
 try:
     from torch.hub import _get_torch_home

From 5e289f69bc564c94132f77c89a34e5f1dd69a592 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Tue, 17 Dec 2019 14:17:11 -0500
Subject: [PATCH 096/110] regex 2019.12.17 install fails with Python 2

---
 requirements.txt | 2 +-
 setup.py         | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 9c43abc6d7..32edee0712 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -5,7 +5,7 @@ boto3
 # Used for downloading models over HTTP
 requests
 # For OpenAI GPT
-regex
+regex != 2019.12.17
 # For XLNet
 sentencepiece
 # For XLM
diff --git a/setup.py b/setup.py
index eacb5ecec0..bf09a7d48a 100644
--- a/setup.py
+++ b/setup.py
@@ -59,7 +59,7 @@ setup(
                       'boto3',
                       'requests',
                       'tqdm',
-                      'regex',
+                      'regex != 2019.12.17',
                       'sentencepiece',
                       'sacremoses'],
     entry_points={

From a4df2e011367020253c8ca8a714c4b4855ff61bc Mon Sep 17 00:00:00 2001
From: Arman Cohan <armanc@allenai.org>
Date: Tue, 26 Nov 2019 16:03:07 -0800
Subject: [PATCH 097/110] update roberta conversion

- update to fix conversion for the updated fairseq model
- create save directory if not exist
---
 ..._original_pytorch_checkpoint_to_pytorch.py | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
index b4dc1bb61b..be3460a86f 100644
--- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -20,6 +20,7 @@ import argparse
 import logging
 import numpy as np
 import torch
+import pathlib
 
 from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
 from fairseq.modules import TransformerSentenceEncoderLayer
@@ -79,15 +80,18 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         ### self attention
         self_attn: BertSelfAttention = layer.attention.self
         assert(
-            roberta_layer.self_attn.in_proj_weight.shape == torch.Size((3 * config.hidden_size, config.hidden_size))
+            roberta_layer.self_attn.k_proj.weight.data.shape == \
+            roberta_layer.self_attn.q_proj.weight.data.shape == \
+            roberta_layer.self_attn.v_proj.weight.data.shape == \
+            torch.Size((config.hidden_size, config.hidden_size))
         )
-        # we use three distinct linear layers so we split the source layer here.
-        self_attn.query.weight.data = roberta_layer.self_attn.in_proj_weight[:config.hidden_size, :]
-        self_attn.query.bias.data = roberta_layer.self_attn.in_proj_bias[:config.hidden_size]
-        self_attn.key.weight.data = roberta_layer.self_attn.in_proj_weight[config.hidden_size:2*config.hidden_size, :]
-        self_attn.key.bias.data = roberta_layer.self_attn.in_proj_bias[config.hidden_size:2*config.hidden_size]
-        self_attn.value.weight.data = roberta_layer.self_attn.in_proj_weight[2*config.hidden_size:, :]
-        self_attn.value.bias.data = roberta_layer.self_attn.in_proj_bias[2*config.hidden_size:]
+
+        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
+        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
+        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
+        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
+        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
+        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias
 
         ### self-attention output
         self_output: BertSelfOutput = layer.attention.output
@@ -151,6 +155,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     if not success:
         raise Exception("Something went wRoNg")
 
+    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
     print(f"Saving model to {pytorch_dump_folder_path}")
     model.save_pretrained(pytorch_dump_folder_path)
 

From ea636440d1ea3497785c2682c410da478f8b1841 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 17 Dec 2019 18:06:42 -0500
Subject: [PATCH 098/110] [roberta.conversion] Do not hardcode vocab size

and support for fairseq 0.9+
---
 ...t_roberta_original_pytorch_checkpoint_to_pytorch.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
index be3460a86f..fedfc1ecb8 100644
--- a/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -22,6 +22,12 @@ import numpy as np
 import torch
 import pathlib
 
+import fairseq
+from packaging import version
+
+if version.parse(fairseq.__version__) < version.parse("0.9.0"):
+    raise Exception("requires fairseq >= 0.9.0")
+
 from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
 from fairseq.modules import TransformerSentenceEncoderLayer
 from transformers.modeling_bert import (BertConfig, BertEncoder,
@@ -46,8 +52,9 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     """
     roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
     roberta.eval()  # disable dropout
+    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
     config = BertConfig(
-        vocab_size=50265,
+        vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
         hidden_size=roberta.args.encoder_embed_dim,
         num_hidden_layers=roberta.args.encoder_layers,
         num_attention_heads=roberta.args.encoder_attention_heads,
@@ -65,7 +72,6 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
 
     # Now let's copy all the weights.
     # Embeddings
-    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
     model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
     model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
     model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight)  # just zero them out b/c RoBERTa doesn't use them.

From a0d386455b347508ea31fc88dd06cc5555255c37 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 17 Dec 2019 20:07:39 -0500
Subject: [PATCH 099/110] Fix outdated tokenizer doc

---
 templates/adding_a_new_model/tokenization_xxx.py | 2 +-
 transformers/tokenization_bert.py                | 4 ++--
 transformers/tokenization_distilbert.py          | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/templates/adding_a_new_model/tokenization_xxx.py b/templates/adding_a_new_model/tokenization_xxx.py
index 3d6b4ad9df..7a10a41e5a 100644
--- a/templates/adding_a_new_model/tokenization_xxx.py
+++ b/templates/adding_a_new_model/tokenization_xxx.py
@@ -85,7 +85,7 @@ class XxxTokenizer(PreTrainedTokenizer):
 
     Args:
         vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
+        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py
index ded5072e58..7ab8029da8 100644
--- a/transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -113,12 +113,12 @@ class BertTokenizer(PreTrainedTokenizer):
 
     Args:
         vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
+        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
         do_basic_tokenize: Whether to do basic tokenization before wordpiece.
         max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
             minimum of this value (if specified) and the underlying BERT model's sequence length.
         never_split: List of tokens which will never be split during tokenization. Only has an effect when
-            do_wordpiece_only=False
+            do_basic_tokenize=True
     """
 
     vocab_files_names = VOCAB_FILES_NAMES
diff --git a/transformers/tokenization_distilbert.py b/transformers/tokenization_distilbert.py
index f40bf2bd77..2f245d71dc 100644
--- a/transformers/tokenization_distilbert.py
+++ b/transformers/tokenization_distilbert.py
@@ -53,12 +53,12 @@ class DistilBertTokenizer(BertTokenizer):
 
     Args:
         vocab_file: Path to a one-wordpiece-per-line vocabulary file
-        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
+        do_lower_case: Whether to lower case the input. Only has an effect when do_basic_tokenize=True
         do_basic_tokenize: Whether to do basic tokenization before wordpiece.
         max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
             minimum of this value (if specified) and the underlying BERT model's sequence length.
         never_split: List of tokens which will never be split during tokenization. Only has an effect when
-            do_wordpiece_only=False
+            do_basic_tokenize=True
     """
 
     vocab_files_names = VOCAB_FILES_NAMES

From 8ac840ff8758fb242e3e89cbc809366165ccf960 Mon Sep 17 00:00:00 2001
From: Antti Virtanen <haama92@gmail.com>
Date: Mon, 16 Dec 2019 17:08:25 +0200
Subject: [PATCH 100/110] Adding Finnish BERT.

---
 transformers/configuration_bert.py | 4 +++-
 transformers/modeling_bert.py      | 4 +++-
 transformers/modeling_tf_bert.py   | 4 +++-
 transformers/tokenization_bert.py  | 6 ++++++
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py
index 9072820bce..b1974966a9 100644
--- a/transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -45,7 +45,9 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-config.json",
     'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
     'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
-    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json"
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
+    'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/config.json",
+    'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/config.json",
 }
 
 
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index d0f35272ac..d0cb5ec617 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -51,7 +51,9 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-pytorch_model.bin",
     'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
     'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
-    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin"
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
+    'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/pytorch_model.bin",
+    'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/pytorch_model.bin",
 }
 
 
diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py
index 7cc71f5063..20b5895dbd 100644
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -51,7 +51,9 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-japanese': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-tf_model.h5",
     'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
     'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
-    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5"
+    'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
+    #'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/pytorch_model.bin",
+    #'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/pytorch_model.bin",
 }
 
 
diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py
index 7ab8029da8..6f6a4d6f19 100644
--- a/transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -46,6 +46,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
         'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
         'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
+        'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/vocab.txt",
+        'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/vocab.txt",
     }
 }
 
@@ -65,6 +67,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'bert-base-cased-finetuned-mrpc': 512,
     'bert-base-german-dbmdz-cased': 512,
     'bert-base-german-dbmdz-uncased': 512,
+    'bert-base-finnish-cased-v1': 512,
+    'bert-base-finnish-uncased-v1': 512,
 }
 
 PRETRAINED_INIT_CONFIGURATION = {
@@ -83,6 +87,8 @@ PRETRAINED_INIT_CONFIGURATION = {
     'bert-base-cased-finetuned-mrpc': {'do_lower_case': False},
     'bert-base-german-dbmdz-cased': {'do_lower_case': False},
     'bert-base-german-dbmdz-uncased': {'do_lower_case': True},
+    'bert-base-finnish-cased-v1': {'do_lower_case': False},
+    'bert-base-finnish-uncased-v1': {'do_lower_case': True},
 }
 
 

From abc43ffbfff69dc91f354c34f1c7c5b48a5c1502 Mon Sep 17 00:00:00 2001
From: Antti Virtanen <haama92@gmail.com>
Date: Mon, 16 Dec 2019 18:08:00 +0200
Subject: [PATCH 101/110] Add pretrained model documentation for FinBERT.

---
 docs/source/pretrained_models.rst | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index c6b990f213..7d037da34f 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -79,6 +79,14 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``bert-base-japanese-char-whole-word-masking``             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | Trained on Japanese text using Whole-Word-Masking. Text is tokenized into characters.                                               |
 |                   |                                                            | (see `details on cl-tohoku repository <https://github.com/cl-tohoku/bert-japanese>`__).                                               |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-finnish-cased-v1``                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on cased Finnish text.                                                                                                      |
+|                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-finnish-uncased-v1``                           | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on uncased Finnish text.                                                                                                    |
+|                   |                                                            | (see `details on turkunlp.org <http://turkunlp.org/FinBERT/>`__).                                                                     |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 | GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
 |                   |                                                            | | OpenAI GPT English model                                                                                                            |

From c5f35e61db8d7286173515071e76612e9e5f5ce5 Mon Sep 17 00:00:00 2001
From: Antti Virtanen <haama92@gmail.com>
Date: Mon, 16 Dec 2019 21:06:14 +0200
Subject: [PATCH 102/110] Uploaded files to AWS.

---
 transformers/configuration_bert.py | 4 ++--
 transformers/modeling_bert.py      | 4 ++--
 transformers/modeling_tf_bert.py   | 4 ++--
 transformers/tokenization_bert.py  | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py
index b1974966a9..c2ccc578c2 100644
--- a/transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -46,8 +46,8 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
     'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
     'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
-    'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/config.json",
-    'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/config.json",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-config.json",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1-config.json",
 }
 
 
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index d0cb5ec617..4e034f4b6e 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -52,8 +52,8 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
     'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
     'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
-    'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/pytorch_model.bin",
-    'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/pytorch_model.bin",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-pytorch_model.bin",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1-pytorch_model.bin",
 }
 
 
diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py
index 20b5895dbd..5a989c299f 100644
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -52,8 +52,8 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
     'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
     'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
-    #'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/pytorch_model.bin",
-    #'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/pytorch_model.bin",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-tf_model.h5",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1-tf_model.h5",
 }
 
 
diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py
index 6f6a4d6f19..c11c1b4d3c 100644
--- a/transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -46,8 +46,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
         'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
         'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
-        'bert-base-finnish-cased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-cased-v1/vocab.txt",
-        'bert-base-finnish-uncased-v1': "http://dl.turkunlp.org/finbert/torch-transformers/bert-base-finnish-uncased-v1/vocab.txt",
+        'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-vocab.txt",
+        'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-vocab.txt",
     }
 }
 

From 7ffa8173905cb6d0819fc424a4806e81a44dd0e0 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 16 Dec 2019 18:55:14 -0500
Subject: [PATCH 103/110] [s3] mv files and update links

---
 transformers/configuration_bert.py | 4 ++--
 transformers/modeling_bert.py      | 4 ++--
 transformers/modeling_tf_bert.py   | 4 ++--
 transformers/tokenization_bert.py  | 4 ++--
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/transformers/configuration_bert.py b/transformers/configuration_bert.py
index c2ccc578c2..7b495013ff 100644
--- a/transformers/configuration_bert.py
+++ b/transformers/configuration_bert.py
@@ -46,8 +46,8 @@ BERT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-config.json",
     'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-config.json",
     'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-config.json",
-    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-config.json",
-    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1-config.json",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/config.json",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/config.json",
 }
 
 
diff --git a/transformers/modeling_bert.py b/transformers/modeling_bert.py
index 4e034f4b6e..afeb9d8e21 100644
--- a/transformers/modeling_bert.py
+++ b/transformers/modeling_bert.py
@@ -52,8 +52,8 @@ BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-pytorch_model.bin",
     'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-pytorch_model.bin",
     'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-pytorch_model.bin",
-    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-pytorch_model.bin",
-    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1-pytorch_model.bin",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/pytorch_model.bin",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/pytorch_model.bin",
 }
 
 
diff --git a/transformers/modeling_tf_bert.py b/transformers/modeling_tf_bert.py
index 5a989c299f..b4f97c06d9 100644
--- a/transformers/modeling_tf_bert.py
+++ b/transformers/modeling_tf_bert.py
@@ -52,8 +52,8 @@ TF_BERT_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'bert-base-japanese-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-whole-word-masking-tf_model.h5",
     'bert-base-japanese-char': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-tf_model.h5",
     'bert-base-japanese-char-whole-word-masking': "https://s3.amazonaws.com/models.huggingface.co/bert/cl-tohoku/bert-base-japanese-char-whole-word-masking-tf_model.h5",
-    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-tf_model.h5",
-    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1-tf_model.h5",
+    'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/tf_model.h5",
+    'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/tf_model.h5",
 }
 
 
diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py
index c11c1b4d3c..18b96c99b3 100644
--- a/transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -46,8 +46,8 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'bert-base-cased-finetuned-mrpc': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-finetuned-mrpc-vocab.txt",
         'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
         'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
-        'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-vocab.txt",
-        'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1-vocab.txt",
+        'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
+        'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
     }
 }
 

From 94c99db34cf9074a212c36554fb925c513d70ab1 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 17 Dec 2019 20:34:22 -0500
Subject: [PATCH 104/110] [FinBERT] fix incorrect url

---
 transformers/tokenization_bert.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/tokenization_bert.py b/transformers/tokenization_bert.py
index 18b96c99b3..edc26d88cf 100644
--- a/transformers/tokenization_bert.py
+++ b/transformers/tokenization_bert.py
@@ -47,7 +47,7 @@ PRETRAINED_VOCAB_FILES_MAP = {
         'bert-base-german-dbmdz-cased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-cased-vocab.txt",
         'bert-base-german-dbmdz-uncased': "https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-german-dbmdz-uncased-vocab.txt",
         'bert-base-finnish-cased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
-        'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-cased-v1/vocab.txt",
+        'bert-base-finnish-uncased-v1': "https://s3.amazonaws.com/models.huggingface.co/bert/TurkuNLP/bert-base-finnish-uncased-v1/vocab.txt",
     }
 }
 

From 8efc6dd544bf1a30d99d4b5abfc5e214699eab2b Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Wed, 18 Dec 2019 10:47:59 -0500
Subject: [PATCH 105/110] fix #2214

---
 transformers/configuration_xlm.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/transformers/configuration_xlm.py b/transformers/configuration_xlm.py
index 0740cc4026..6839a45746 100644
--- a/transformers/configuration_xlm.py
+++ b/transformers/configuration_xlm.py
@@ -144,6 +144,9 @@ class XLMConfig(PretrainedConfig):
         self.start_n_top = start_n_top
         self.end_n_top = end_n_top
 
+        if "n_words" in kwargs:
+            self.n_words = kwargs["n_words"]
+
     @property
     def n_words(self):  # For backward compatibility
         return self.vocab_size

From ed6ba93912d223886fe0b88dd4ee58b20774beaf Mon Sep 17 00:00:00 2001
From: patrickvonplaten <patrick.v.platen@gmail.com>
Date: Thu, 19 Dec 2019 01:26:01 +0100
Subject: [PATCH 106/110] corrected typo in example for t5 model input argument

---
 transformers/modeling_t5.py    | 4 ++--
 transformers/modeling_tf_t5.py | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/transformers/modeling_t5.py b/transformers/modeling_t5.py
index 263dc33b70..9baf69d02b 100644
--- a/transformers/modeling_t5.py
+++ b/transformers/modeling_t5.py
@@ -693,7 +693,7 @@ class T5Model(T5PreTrainedModel):
         tokenizer = T5Tokenizer.from_pretrained('t5-small')
         model = T5Model.from_pretrained('t5-small')
         input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids)
+        outputs = model(input_ids=input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
@@ -798,7 +798,7 @@ class T5WithLMHeadModel(T5PreTrainedModel):
         tokenizer = T5Tokenizer.from_pretrained('t5-small')
         model = T5WithLMHeadModel.from_pretrained('t5-small')
         input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
-        outputs = model(input_ids, lm_labels=input_ids)
+        outputs = model(input_ids=input_ids, lm_labels=input_ids)
         loss, prediction_scores = outputs[:2]
 
     """
diff --git a/transformers/modeling_tf_t5.py b/transformers/modeling_tf_t5.py
index 1336a1c30d..e803e00c8d 100644
--- a/transformers/modeling_tf_t5.py
+++ b/transformers/modeling_tf_t5.py
@@ -610,7 +610,7 @@ class TFT5Model(TFT5PreTrainedModel):
         tokenizer = T5Tokenizer.from_pretrained('t5-small')
         model = TFT5Model.from_pretrained('t5-small')
         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
+        outputs = model(input_ids=input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
 
     """
@@ -701,7 +701,7 @@ class TFT5WithLMHeadModel(TFT5PreTrainedModel):
         tokenizer = T5Tokenizer.from_pretrained('t5-small')
         model = TFT5WithLMHeadModel.from_pretrained('t5-small')
         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
-        outputs = model(input_ids)
+        outputs = model(input_ids=input_ids)
         prediction_scores = outputs[0]
 
     """

From 284572efc05a6a8d9e351e886ea3cab0f5f2367a Mon Sep 17 00:00:00 2001
From: Ejar <ejarkm@gmail.com>
Date: Wed, 18 Dec 2019 17:47:47 +0100
Subject: [PATCH 107/110] Updated typo on the link

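Fixed the malformed Markdown link to `run_tf_ner.py` in examples/README.md
(the closing bracket was misplaced, so the link did not render).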
---
 examples/README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/README.md b/examples/README.md
index b6b3908810..fcd2fe1f6f 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -467,7 +467,7 @@ Training with the previously defined hyper-parameters yields the following resul
 ## Named Entity Recognition
 
 Based on the scripts [`run_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_ner.py) for Pytorch and
-[`run_tf_ner.py`(https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py)] for Tensorflow 2.
+[`run_tf_ner.py`](https://github.com/huggingface/transformers/blob/master/examples/run_tf_ner.py) for Tensorflow 2.
 This example fine-tune Bert Multilingual on GermEval 2014 (German NER).
 Details and results for the fine-tuning provided by @stefan-it.
 

From 62c1fc3c1ecdfab787ee3c34d1ec1eba65c18877 Mon Sep 17 00:00:00 2001
From: Francesco <francesco.cottone@vidiemme.it>
Date: Thu, 19 Dec 2019 14:43:10 +0100
Subject: [PATCH 108/110] Removed duplicate 'xlm' entry (XLMConfig,
 XLMForQuestionAnswering, XLMTokenizer) from MODEL_CLASSES in run_squad.py

---
 examples/run_squad.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index 34c31c3bb8..1ff6983f62 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -61,7 +61,6 @@ MODEL_CLASSES = {
     'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer),
     'distilbert': (DistilBertConfig, DistilBertForQuestionAnswering, DistilBertTokenizer),
     'albert': (AlbertConfig, AlbertForQuestionAnswering, AlbertTokenizer),
-    'xlm': (XLMConfig, XLMForQuestionAnswering, XLMTokenizer)
 }
 
 def set_seed(args):

From a1f1dce0ae511ef7766c6b6a8f5ebf9118279e73 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Thu, 19 Dec 2019 12:25:55 -0500
Subject: [PATCH 109/110] Correct max position for SQUAD and TFDS

---
 transformers/data/processors/squad.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/transformers/data/processors/squad.py b/transformers/data/processors/squad.py
index 84aa429e26..8e72bbbd6d 100644
--- a/transformers/data/processors/squad.py
+++ b/transformers/data/processors/squad.py
@@ -571,7 +571,9 @@ class SquadExample(object):
         # Start end end positions only has a value during evaluation.
         if start_position_character is not None and not is_impossible:
             self.start_position = char_to_word_offset[start_position_character]
-            self.end_position = char_to_word_offset[start_position_character + len(answer_text) - 1]
+            self.end_position = char_to_word_offset[
+                min(start_position_character + len(answer_text) - 1, len(char_to_word_offset) - 1)
+            ]
 
 
 class SquadFeatures(object):

From 33adab2b91697b3e78af618a21ab9f1176281165 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Thu, 19 Dec 2019 12:40:43 -0500
Subject: [PATCH 110/110] Fix albert example

---
 transformers/modeling_tf_albert.py | 4 ++--
 transformers/modeling_utils.py     | 5 -----
 2 files changed, 2 insertions(+), 7 deletions(-)

diff --git a/transformers/modeling_tf_albert.py b/transformers/modeling_tf_albert.py
index d1650d41a8..ac55a73fa3 100644
--- a/transformers/modeling_tf_albert.py
+++ b/transformers/modeling_tf_albert.py
@@ -587,8 +587,8 @@ class TFAlbertModel(TFAlbertPreTrainedModel):
         import tensorflow as tf
         from transformers import AlbertTokenizer, TFAlbertModel
 
-        tokenizer = AlbertTokenizer.from_pretrained('bert-base-uncased')
-        model = TFAlbertModel.from_pretrained('bert-base-uncased')
+        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v1')
+        model = TFAlbertModel.from_pretrained('albert-base-v1')
         input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute"))[None, :]  # Batch size 1
         outputs = model(input_ids)
         last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
diff --git a/transformers/modeling_utils.py b/transformers/modeling_utils.py
index 9bd99b25dc..eff54f71e1 100644
--- a/transformers/modeling_utils.py
+++ b/transformers/modeling_utils.py
@@ -327,11 +327,6 @@ class PreTrainedModel(nn.Module):
             model = BertModel.from_pretrained('./tf_model/my_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if pretrained_model_name_or_path is not None and (
-                "albert" in pretrained_model_name_or_path and "v2" in pretrained_model_name_or_path):
-            logger.warning("There is currently an upstream reproducibility issue with ALBERT v2 models. Please see " +
-                           "https://github.com/google-research/google-research/issues/119 for more information.")
-
         config = kwargs.pop('config', None)
         state_dict = kwargs.pop('state_dict', None)
         cache_dir = kwargs.pop('cache_dir', None)