From 71b47505175111dd391a5b9de9514fbe50558bf0 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Mon, 16 Dec 2019 16:37:27 +0100
Subject: [PATCH 01/26] examples: add support for XLM-RoBERTa to run_ner script

---
 examples/run_ner.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/examples/run_ner.py b/examples/run_ner.py
index 1ab1236d94..6426a6d1db 100644
--- a/examples/run_ner.py
+++ b/examples/run_ner.py
@@ -38,11 +38,13 @@ from transformers import WEIGHTS_NAME, BertConfig, BertForTokenClassification, B
 from transformers import RobertaConfig, RobertaForTokenClassification, RobertaTokenizer
 from transformers import DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer
 from transformers import CamembertConfig, CamembertForTokenClassification, CamembertTokenizer
+from transformers import XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer
 
 logger = logging.getLogger(__name__)
 
 ALL_MODELS = sum(
-    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig)),
+    (tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, RobertaConfig, DistilBertConfig,
+                                                                   CamembertConfig, XLMRobertaConfig)),
     ())
 
 MODEL_CLASSES = {
@@ -50,6 +52,7 @@ MODEL_CLASSES = {
     "roberta": (RobertaConfig, RobertaForTokenClassification, RobertaTokenizer),
     "distilbert": (DistilBertConfig, DistilBertForTokenClassification, DistilBertTokenizer),
     "camembert": (CamembertConfig, CamembertForTokenClassification, CamembertTokenizer),
+    "xlmroberta": (XLMRobertaConfig, XLMRobertaForTokenClassification, XLMRobertaTokenizer),
 }
 
 

From d3549b66af6f225cace48f8462ba715508f51b0d Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Mon, 16 Dec 2019 16:38:39 +0100
Subject: [PATCH 02/26] module: add support for XLM-RoBERTa (__init__)

---
 transformers/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/transformers/__init__.py b/transformers/__init__.py
index 740d2440c2..910ba91457 100644
--- a/transformers/__init__.py
+++ b/transformers/__init__.py
@@ -49,6 +49,7 @@ from .tokenization_distilbert import DistilBertTokenizer
 from .tokenization_albert import AlbertTokenizer
 from .tokenization_camembert import CamembertTokenizer
 from .tokenization_t5 import T5Tokenizer
+from .tokenization_xlm_roberta import XLMRobertaTokenizer
 
 # Configurations
 from .configuration_utils import PretrainedConfig
@@ -65,6 +66,7 @@ from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CO
 from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 
 # Modeling
 if is_torch_available():
@@ -119,6 +121,9 @@ if is_torch_available():
                                 AlbertForQuestionAnswering,
                                 load_tf_weights_in_albert, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP)
 
+    from .modeling_xlm_roberta import (XLMRobertaForMaskedLM, XLMRobertaModel, XLMRobertaForMultipleChoice,
+                                       XLMRobertaForSequenceClassification, XLMRobertaForTokenClassification)
+
     # Optimization
     from .optimization import (AdamW, get_constant_schedule, get_constant_schedule_with_warmup, get_cosine_schedule_with_warmup,
                                get_cosine_with_hard_restarts_schedule_with_warmup, get_linear_schedule_with_warmup)

From 9ed09cb4a31518b13f2c58c057e43e029c32611a Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Mon, 16 Dec 2019 16:46:58 +0100
Subject: [PATCH 03/26] converter: add conversion script for original
 XLM-RoBERTa weights to Transformers-compatible weights

---
 ..._original_pytorch_checkpoint_to_pytorch.py | 184 ++++++++++++++++++
 1 file changed, 184 insertions(+)
 create mode 100644 transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py

diff --git a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py
new file mode 100644
index 0000000000..888adf4819
--- /dev/null
+++ b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -0,0 +1,184 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert RoBERTa checkpoint."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import numpy as np
+import torch
+import pathlib
+
+from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
+from fairseq.modules import TransformerSentenceEncoderLayer
+from transformers.modeling_bert import (BertConfig, BertEncoder,
+                                        BertIntermediate, BertLayer,
+                                        BertModel, BertOutput,
+                                        BertSelfAttention,
+                                        BertSelfOutput)
+from transformers.modeling_roberta import (RobertaEmbeddings,
+                                           RobertaForMaskedLM,
+                                           RobertaForSequenceClassification,
+                                           RobertaModel)
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+SAMPLE_TEXT = 'Hello world! cécé herlolip'
+
+
+def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
+    """
+    Copy/paste/tweak roberta's weights to our BERT structure.
+    """
+    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path, bpe = 'sentencepiece')
+    roberta.eval()  # disable dropout
+    config = BertConfig(
+        vocab_size_or_config_json_file=250004,
+        hidden_size=roberta.args.encoder_embed_dim,
+        num_hidden_layers=roberta.args.encoder_layers,
+        num_attention_heads=roberta.args.encoder_attention_heads,
+        intermediate_size=roberta.args.encoder_ffn_embed_dim,
+        max_position_embeddings=514,
+        type_vocab_size=1,
+        layer_norm_eps=1e-5, # PyTorch default used in fairseq
+    )
+    if classification_head:
+        config.num_labels = roberta.args.num_classes
+    print("Our BERT config:", config)
+
+    model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
+    model.eval()
+
+    # Now let's copy all the weights.
+    # Embeddings
+    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
+    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
+    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
+    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight)  # just zero them out b/c RoBERTa doesn't use them.
+    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
+    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
+
+    for i in range(config.num_hidden_layers):
+        # Encoder: start of layer
+        layer: BertLayer = model.roberta.encoder.layer[i]
+        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]
+
+        ### self attention
+        self_attn: BertSelfAttention = layer.attention.self
+        assert(
+            roberta_layer.self_attn.k_proj.weight.data.shape == \
+            roberta_layer.self_attn.q_proj.weight.data.shape == \
+            roberta_layer.self_attn.v_proj.weight.data.shape == \
+            torch.Size((config.hidden_size, config.hidden_size))
+        )
+
+        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
+        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
+        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
+        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
+        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
+        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias
+
+        ### self-attention output
+        self_output: BertSelfOutput = layer.attention.output
+        assert(
+            self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
+        )
+        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
+        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
+        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
+        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
+
+        ### intermediate
+        intermediate: BertIntermediate = layer.intermediate
+        assert(
+            intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
+        )
+        intermediate.dense.weight = roberta_layer.fc1.weight
+        intermediate.dense.bias = roberta_layer.fc1.bias
+
+        ### output
+        bert_output: BertOutput = layer.output
+        assert(
+            bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
+        )
+        bert_output.dense.weight = roberta_layer.fc2.weight
+        bert_output.dense.bias = roberta_layer.fc2.bias
+        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
+        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
+        #### end of layer
+    
+    if classification_head:
+        model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight
+        model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias
+        model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight
+        model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias
+    else:
+        # LM Head
+        model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
+        model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
+        model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
+        model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
+        model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
+        model.lm_head.bias = roberta.model.decoder.lm_head.bias
+
+    # Let's check that we get the same results.
+    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1
+
+    our_output = model(input_ids)[0]
+    if classification_head:
+        their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids))
+    else:
+        their_output = roberta.model(input_ids)[0]
+    print(our_output.shape, their_output.shape)
+    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
+    print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7
+    success = torch.allclose(our_output, their_output, atol=1e-3)
+    print(
+        "Do both models output the same tensors?",
+        "🔥" if success else "💩"
+    )
+    if not success:
+        raise Exception("Something went wRoNg")
+
+    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
+    print(f"Saving model to {pytorch_dump_folder_path}")
+    model.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters
+    parser.add_argument("--roberta_checkpoint_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path the official PyTorch dump.")
+    parser.add_argument("--pytorch_dump_folder_path",
+                        default = None,
+                        type = str,
+                        required = True,
+                        help = "Path to the output PyTorch model.")
+    parser.add_argument("--classification_head",
+                        action = "store_true",
+                        help = "Whether to convert a final classification head.")
+    args = parser.parse_args()
+    convert_roberta_checkpoint_to_pytorch(
+        args.roberta_checkpoint_path,
+        args.pytorch_dump_folder_path,
+        args.classification_head
+    )

From a648ff738c88a41cfae4f915a4391c7d66261b64 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Mon, 16 Dec 2019 16:47:39 +0100
Subject: [PATCH 04/26] configuration: add support for XLM-RoBERTa model

---
 transformers/configuration_xlm_roberta.py | 33 +++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 transformers/configuration_xlm_roberta.py

diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py
new file mode 100644
index 0000000000..1633cc18aa
--- /dev/null
+++ b/transformers/configuration_xlm_roberta.py
@@ -0,0 +1,33 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" XLM-RoBERTa configuration """
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+
+from .configuration_roberta import RobertaConfig
+
+logger = logging.getLogger(__name__)
+
+XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'xlm-roberta-base': "https://schweter.eu/cloud/transformers/xlm-roberta-large-config.json",
+}
+
+
+class XLMRobertaConfig(RobertaConfig):
+    pretrained_config_archive_map = XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP

From 69f4f058fa5ecc6fea8c65ae59694442bba795e6 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Mon, 16 Dec 2019 17:00:12 +0100
Subject: [PATCH 05/26] model: add support for new XLM-RoBERTa model

---
 transformers/modeling_xlm_roberta.py | 293 +++++++++++++++++++++++++++
 1 file changed, 293 insertions(+)
 create mode 100644 transformers/modeling_xlm_roberta.py

diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py
new file mode 100644
index 0000000000..8402be4b5c
--- /dev/null
+++ b/transformers/modeling_xlm_roberta.py
@@ -0,0 +1,293 @@
+# coding=utf-8
+# Copyright 2019 Facebook AI Research and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch XLM-RoBERTa model. """
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+
+from .modeling_roberta import RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification, RobertaForMultipleChoice, RobertaForTokenClassification
+from .configuration_xlm_roberta import XLMRobertaConfig
+from .file_utils import add_start_docstrings
+
+logger = logging.getLogger(__name__)
+
+XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-pytorch_model.bin",
+}
+
+
+XLM_ROBERTA_START_DOCSTRING = r"""    The XLM-RoBERTa model was proposed in
+    `Unsupervised Cross-lingual Representation Learning at Scale`_
+    by Alexis Conneau, Kartikay Khandelwal, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov. It is based on Facebook's RoBERTa model released in 2019.
+    
+    It is a large multi-lingual language model, trained on 2.5TB of filtered CommonCrawl data.
+
+    This implementation is the same as RoBERTa.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    .. _`Unsupervised Cross-lingual Representation Learning at Scale`:
+        https://arxiv.org/abs/1911.02116
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~transformers.XLMRobertaConfig`): Model configuration class with all the parameters of the 
+            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model weights.
+"""
+
+XLM_ROBERTA_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, XLM-RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         <s> Is this Jacksonville ? </s> </s> No it is not . </s>``
+
+            (b) For single sequences:
+
+                ``tokens:         <s> the dog is hairy . </s>``
+
+            Fully encoded sequences or sequence pairs can be obtained using the XLMRobertaTokenizer.encode function with 
+            the ``add_special_tokens`` parameter set to ``True``.
+
+            XLM-RoBERTa is a model with absolute position embeddings so it's usually advised to pad the inputs on
+            the right rather than the left.
+
+            See :func:`transformers.PreTrainedTokenizer.encode` and
+            :func:`transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **token_type_ids**: (`optional` need to be trained) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Optional segment token indices to indicate first and second portions of the inputs.
+            This embedding matrice is not trained (not pretrained during XLM-RoBERTa pretraining), you will have to train it
+            during finetuning.
+            Indices are selected in ``[0, 1]``: ``0`` corresponds to a `sentence A` token, ``1``
+            corresponds to a `sentence B` token
+            (see `BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding`_ for more details).
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1[``.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+        **inputs_embeds**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, embedding_dim)``:
+            Optionally, instead of passing ``input_ids`` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+"""
+
+@add_start_docstrings("The bare XLM-RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+                      XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaModel(RobertaModel):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            eo match pre-training, XLM-RoBERTa input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] [SEP] no it is not . [SEP]``
+
+                ``token_type_ids:   0   0  0    0    0     0       0   0   0     1  1  1  1   1   1``
+
+            (b) For single sequences:
+
+                ``tokens:         [CLS] the dog is hairy . [SEP]``
+
+                ``token_type_ids:   0   0   0   0  0     0   0``
+
+            objective during Bert pretraining. This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaModel.from_pretrained('xlm-roberta-large')
+        input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
+    """
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""XLM-RoBERTa Model with a `language modeling` head on top. """,
+    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaForMaskedLM(RobertaForMaskedLM):
+    r"""
+        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Masked language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaForMaskedLM.from_pretrained('xlm-roberta-large')
+        input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, masked_lm_labels=input_ids)
+        loss, prediction_scores = outputs[:2]
+
+    """
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""XLM-RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 
+    on top of the pooled output) e.g. for GLUE tasks. """,
+    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaForSequenceClassification(RobertaForSequenceClassification):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaForSequenceClassification.from_pretrained('xlm-roberta-large')
+        input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
+
+    """
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""XLM-RoBERTa Model with a multiple choice classification head on top (a linear layer on top of
+    the pooled output and a softmax) e.g. for RocStories/SWAG tasks. """,
+    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaForMultipleChoice(RobertaForMultipleChoice):
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **classification_scores**: ``torch.FloatTensor`` of shape ``(batch_size, num_choices)`` where `num_choices` is the size of the second dimension
+            of the input tensors. (see `input_ids` above).
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaForMultipleChoice.from_pretrained('xlm-roberta-large')
+        choices = ["Schloß Nymphenburg ist sehr schön .", "Der Schloßkanal auch !"]
+        input_ids = torch.tensor([tokenizer.encode(s, add_special_tokens=True) for s in choices]).unsqueeze(0)  # Batch size 1, 2 choices
+        labels = torch.tensor(1).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, classification_scores = outputs[:2]
+
+    """
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+
+@add_start_docstrings("""XLM-RoBERTa Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks. """,
+    XLM_ROBERTA_START_DOCSTRING, XLM_ROBERTA_INPUTS_DOCSTRING)
+class XLMRobertaForTokenClassification(RobertaForTokenClassification):
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the token classification loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification loss.
+        **scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.num_labels)``
+            Classification scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
+        model = XLMRobertaForTokenClassification.from_pretrained('xlm-roberta-large')
+        input_ids = torch.tensor(tokenizer.encode("Schloß Nymphenburg ist sehr schön .", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1] * input_ids.size(1)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, scores = outputs[:2]
+
+    """
+    config_class = XLMRobertaConfig
+    pretrained_model_archive_map = XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP

From 59a1aefb1ca51b183bffa2d355bc2a22a7c51274 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Mon, 16 Dec 2019 17:00:55 +0100
Subject: [PATCH 06/26] tokenization: add support for new XLM-RoBERTa model.
 Add wrapper around fairseq tokenization logic

---
 transformers/tokenization_xlm_roberta.py | 154 +++++++++++++++++++++++
 1 file changed, 154 insertions(+)
 create mode 100644 transformers/tokenization_xlm_roberta.py

diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py
new file mode 100644
index 0000000000..0f95397606
--- /dev/null
+++ b/transformers/tokenization_xlm_roberta.py
@@ -0,0 +1,154 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+""" Tokenization classes for XLM-RoBERTa model."""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+import os
+from shutil import copyfile
+
+import sentencepiece as spm
+from transformers.tokenization_utils import PreTrainedTokenizer
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+    'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-sentencepiece.bpe.model",
+    }
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'xlm-roberta-large': None,
+}
+
+class XLMRobertaTokenizer(PreTrainedTokenizer):
+    """
+        Adapted from RobertaTokenizer and XLNetTokenizer
+        SentencePiece based tokenizer. Peculiarities:
+
+            - requires `SentencePiece <https://github.com/google/sentencepiece>`_
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, vocab_file, bos_token="<s>", eos_token="</s>", sep_token="</s>",
+                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>',
+                 **kwargs):
+        super(XLMRobertaTokenizer, self).__init__(max_len=512, bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
+                                                 sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
+                                                 mask_token=mask_token,
+                                                 **kwargs)
+        self.max_len_single_sentence = self.max_len - 2  # take into account special tokens
+        self.max_len_sentences_pair = self.max_len - 4  # take into account special tokens
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+        self.fairseq_tokens_to_ids = {"<s>": 0, "<unk>": 1, "</s>": 2}
+        self.fairseq_tokens_to_ids['<mask>'] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks
+        by concatenating and adding special tokens.
+        A RoBERTa sequence has the following format:
+            single sequence: <s> X </s>
+            pair of sequences: <s> A </s></s> B </s>
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` or ``encode_plus`` methods.
+
+        Args:
+            token_ids_0: list of ids (must not contain special tokens)
+            token_ids_1: Optional list of ids (must not contain special tokens), necessary when fetching sequence ids
+                for sequence pairs
+            already_has_special_tokens: (default False) Set to True if the token list is already formated with
+                special tokens for the model
+
+        Returns:
+            A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError("You should not supply a second sequence if the provided sequence of "
+                                 "ids is already formated with special tokens for the model.")
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None):
+        """
+        Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
+        A RoBERTa sequence pair mask has the following format:
+        0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1
+        | first sequence    | second sequence
+
+        if token_ids_1 is None, only returns the first portion of the mask (0's).
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep) * [0] + len(token_ids_1 + sep) * [1]
+
+    @property
+    def vocab_size(self):
+        return len(self.sp_model) + len(self.fairseq_tokens_to_ids)
+
+    def _tokenize(self, text):
+        return self.sp_model.EncodeAsPieces(text)
+
+    def _convert_token_to_id(self, token):
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        if token in self.fairseq_tokens_to_ids:
+            return self.fairseq_tokens_to_ids[token]
+        return self.sp_model.PieceToId(token) + 1
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        if index in self.fairseq_ids_to_tokens:
+            return self.fairseq_ids_to_tokens[index]
+        return self.sp_model.IdToPiece(index + 1)
+
+    def save_vocabulary(self, save_directory):
+        """ Save the sentencepiece vocabulary (copy original file) and special tokens file
+            to a directory.
+        """
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)

From a701a0cee1ae6cb7b93b047cc3ffc06b01157955 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Mon, 16 Dec 2019 17:17:56 +0100
Subject: [PATCH 07/26] configuration: fix model name for large XLM-RoBERTa
 model

---
 transformers/configuration_xlm_roberta.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py
index 1633cc18aa..dd03572976 100644
--- a/transformers/configuration_xlm_roberta.py
+++ b/transformers/configuration_xlm_roberta.py
@@ -25,7 +25,7 @@ from .configuration_roberta import RobertaConfig
 logger = logging.getLogger(__name__)
 
 XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'xlm-roberta-base': "https://schweter.eu/cloud/transformers/xlm-roberta-large-config.json",
+    'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-config.json",
 }
 
 

From d064009b72c4a29cd66b6c633dcd8c3ad5ab6dca Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Mon, 16 Dec 2019 17:23:25 +0100
Subject: [PATCH 08/26] converter: fix vocab size

---
 ...onvert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py
index 888adf4819..6873f1d0f0 100644
--- a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -47,7 +47,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path, bpe = 'sentencepiece')
     roberta.eval()  # disable dropout
     config = BertConfig(
-        vocab_size_or_config_json_file=250004,
+        vocab_size_or_config_json_file=250002,
         hidden_size=roberta.args.encoder_embed_dim,
         num_hidden_layers=roberta.args.encoder_layers,
         num_attention_heads=roberta.args.encoder_attention_heads,

From f349826a57e6f7f1eb5c28ef3b3ff0ac6884ad24 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan.schweter@bsb-muenchen.de>
Date: Tue, 17 Dec 2019 10:36:04 +0100
Subject: [PATCH 09/26] model: fix cls and sep token for XLM-RoBERTa
 documentation

---
 transformers/modeling_xlm_roberta.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py
index 8402be4b5c..4c833c69ff 100644
--- a/transformers/modeling_xlm_roberta.py
+++ b/transformers/modeling_xlm_roberta.py
@@ -111,17 +111,17 @@ class XLMRobertaModel(RobertaModel):
             Last layer hidden-state of the first token of the sequence (classification token)
             further processed by a Linear layer and a Tanh activation function. The Linear
             layer weights are trained from the next sentence prediction (classification)
-            eo match pre-training, XLM-RoBERTa input sequence should be formatted with [CLS] and [SEP] tokens as follows:
+            eo match pre-training, XLM-RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
 
             (a) For sequence pairs:
 
-                ``tokens:         [CLS] is this jack ##son ##ville ? [SEP] [SEP] no it is not . [SEP]``
+                ``tokens:         <s> is this jack ##son ##ville ? </s> </s> no it is not . </s>``
 
                 ``token_type_ids:   0   0  0    0    0     0       0   0   0     1  1  1  1   1   1``
 
             (b) For single sequences:
 
-                ``tokens:         [CLS] the dog is hairy . [SEP]``
+                ``tokens:         <s> the dog is hairy . </s>``
 
                 ``token_type_ids:   0   0   0   0  0     0   0``
 

From 2f1c745cded91b2f6cfed5b502ea5cbd7d6b9ac7 Mon Sep 17 00:00:00 2001
From: thomwolf <thomwolf@gmail.com>
Date: Tue, 17 Dec 2019 11:47:54 +0100
Subject: [PATCH 10/26] update conversion script

---
 ...onvert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py
index 6873f1d0f0..884c273d2c 100644
--- a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -47,7 +47,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path, bpe = 'sentencepiece')
     roberta.eval()  # disable dropout
     config = BertConfig(
-        vocab_size_or_config_json_file=250002,
+        vocab_size=250002,
         hidden_size=roberta.args.encoder_embed_dim,
         num_hidden_layers=roberta.args.encoder_layers,
         num_attention_heads=roberta.args.encoder_attention_heads,

From ca31abc6d6fe35a39703ed36775853595149e956 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 11:36:54 +0100
Subject: [PATCH 11/26] tokenization: *align* fairseq and spm vocab to fix some
 tokenization errors

---
 transformers/tokenization_xlm_roberta.py | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py
index 0f95397606..d8484e7f9c 100644
--- a/transformers/tokenization_xlm_roberta.py
+++ b/transformers/tokenization_xlm_roberta.py
@@ -61,7 +61,19 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         self.sp_model = spm.SentencePieceProcessor()
         self.sp_model.Load(str(vocab_file))
         self.vocab_file = vocab_file
-        self.fairseq_tokens_to_ids = {"<s>": 0, "<unk>": 1, "</s>": 2}
+
+        # Original fairseq vocab and spm vocab must be "aligned":
+        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
+        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
+        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
+        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
+
+        # Mimic fairseq token-to-id alignment for the first 4 token
+        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
+
+        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
+        self.fairseq_offset = 1
+
         self.fairseq_tokens_to_ids['<mask>'] = len(self.sp_model) + len(self.fairseq_tokens_to_ids)
         self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
 
@@ -131,13 +143,13 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
         """ Converts a token (str/unicode) in an id using the vocab. """
         if token in self.fairseq_tokens_to_ids:
             return self.fairseq_tokens_to_ids[token]
-        return self.sp_model.PieceToId(token) + 1
+        return self.sp_model.PieceToId(token) + self.fairseq_offset
 
     def _convert_id_to_token(self, index):
         """Converts an index (integer) in a token (string/unicode) using the vocab."""
         if index in self.fairseq_ids_to_tokens:
             return self.fairseq_ids_to_tokens[index]
-        return self.sp_model.IdToPiece(index + 1)
+        return self.sp_model.IdToPiece(index - self.fairseq_offset)
 
     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file

From 01b68be34f906a210b0c0a5e2bd9bc605c5983f2 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 12:24:46 +0100
Subject: [PATCH 12/26] converter: remove XLM-RoBERTa specific script (can be
 done with the script for RoBERTa now)

---
 ..._original_pytorch_checkpoint_to_pytorch.py | 184 ------------------
 1 file changed, 184 deletions(-)
 delete mode 100644 transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py

diff --git a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py b/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py
deleted file mode 100644
index 884c273d2c..0000000000
--- a/transformers/convert_xlm_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ /dev/null
@@ -1,184 +0,0 @@
-# coding=utf-8
-# Copyright 2018 The HuggingFace Inc. team.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-"""Convert RoBERTa checkpoint."""
-
-from __future__ import absolute_import, division, print_function
-
-import argparse
-import logging
-import numpy as np
-import torch
-import pathlib
-
-from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
-from fairseq.modules import TransformerSentenceEncoderLayer
-from transformers.modeling_bert import (BertConfig, BertEncoder,
-                                        BertIntermediate, BertLayer,
-                                        BertModel, BertOutput,
-                                        BertSelfAttention,
-                                        BertSelfOutput)
-from transformers.modeling_roberta import (RobertaEmbeddings,
-                                           RobertaForMaskedLM,
-                                           RobertaForSequenceClassification,
-                                           RobertaModel)
-
-logging.basicConfig(level=logging.INFO)
-logger = logging.getLogger(__name__)
-
-SAMPLE_TEXT = 'Hello world! cécé herlolip'
-
-
-def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
-    """
-    Copy/paste/tweak roberta's weights to our BERT structure.
-    """
-    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path, bpe = 'sentencepiece')
-    roberta.eval()  # disable dropout
-    config = BertConfig(
-        vocab_size=250002,
-        hidden_size=roberta.args.encoder_embed_dim,
-        num_hidden_layers=roberta.args.encoder_layers,
-        num_attention_heads=roberta.args.encoder_attention_heads,
-        intermediate_size=roberta.args.encoder_ffn_embed_dim,
-        max_position_embeddings=514,
-        type_vocab_size=1,
-        layer_norm_eps=1e-5, # PyTorch default used in fairseq
-    )
-    if classification_head:
-        config.num_labels = roberta.args.num_classes
-    print("Our BERT config:", config)
-
-    model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
-    model.eval()
-
-    # Now let's copy all the weights.
-    # Embeddings
-    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
-    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
-    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
-    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight)  # just zero them out b/c RoBERTa doesn't use them.
-    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
-    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
-
-    for i in range(config.num_hidden_layers):
-        # Encoder: start of layer
-        layer: BertLayer = model.roberta.encoder.layer[i]
-        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]
-
-        ### self attention
-        self_attn: BertSelfAttention = layer.attention.self
-        assert(
-            roberta_layer.self_attn.k_proj.weight.data.shape == \
-            roberta_layer.self_attn.q_proj.weight.data.shape == \
-            roberta_layer.self_attn.v_proj.weight.data.shape == \
-            torch.Size((config.hidden_size, config.hidden_size))
-        )
-
-        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
-        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
-        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
-        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
-        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
-        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias
-
-        ### self-attention output
-        self_output: BertSelfOutput = layer.attention.output
-        assert(
-            self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
-        )
-        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
-        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
-        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
-        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
-
-        ### intermediate
-        intermediate: BertIntermediate = layer.intermediate
-        assert(
-            intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
-        )
-        intermediate.dense.weight = roberta_layer.fc1.weight
-        intermediate.dense.bias = roberta_layer.fc1.bias
-
-        ### output
-        bert_output: BertOutput = layer.output
-        assert(
-            bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
-        )
-        bert_output.dense.weight = roberta_layer.fc2.weight
-        bert_output.dense.bias = roberta_layer.fc2.bias
-        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
-        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
-        #### end of layer
-    
-    if classification_head:
-        model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight
-        model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias
-        model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight
-        model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias
-    else:
-        # LM Head
-        model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
-        model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
-        model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
-        model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
-        model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
-        model.lm_head.bias = roberta.model.decoder.lm_head.bias
-
-    # Let's check that we get the same results.
-    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1
-
-    our_output = model(input_ids)[0]
-    if classification_head:
-        their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids))
-    else:
-        their_output = roberta.model(input_ids)[0]
-    print(our_output.shape, their_output.shape)
-    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
-    print(f"max_absolute_diff = {max_absolute_diff}") # ~ 1e-7
-    success = torch.allclose(our_output, their_output, atol=1e-3)
-    print(
-        "Do both models output the same tensors?",
-        "🔥" if success else "💩"
-    )
-    if not success:
-        raise Exception("Something went wRoNg")
-
-    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
-    print(f"Saving model to {pytorch_dump_folder_path}")
-    model.save_pretrained(pytorch_dump_folder_path)
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    ## Required parameters
-    parser.add_argument("--roberta_checkpoint_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path the official PyTorch dump.")
-    parser.add_argument("--pytorch_dump_folder_path",
-                        default = None,
-                        type = str,
-                        required = True,
-                        help = "Path to the output PyTorch model.")
-    parser.add_argument("--classification_head",
-                        action = "store_true",
-                        help = "Whether to convert a final classification head.")
-    args = parser.parse_args()
-    convert_roberta_checkpoint_to_pytorch(
-        args.roberta_checkpoint_path,
-        args.pytorch_dump_folder_path,
-        args.classification_head
-    )

From 41a13a6375817ec3836bedcec67d12ad32bf0956 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 18:20:27 +0100
Subject: [PATCH 13/26] auto: add XLMRoBERTa to auto configuration

---
 transformers/configuration_auto.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/transformers/configuration_auto.py b/transformers/configuration_auto.py
index 9fe58f173a..29e1a0ee1d 100644
--- a/transformers/configuration_auto.py
+++ b/transformers/configuration_auto.py
@@ -30,6 +30,7 @@ from .configuration_distilbert import DistilBertConfig, DISTILBERT_PRETRAINED_CO
 from .configuration_albert import AlbertConfig, ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_camembert import CamembertConfig, CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP
 from .configuration_t5 import T5Config, T5_PRETRAINED_CONFIG_ARCHIVE_MAP
+from .configuration_xlm_roberta import XLMRobertaConfig, XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 
 logger = logging.getLogger(__name__)
 
@@ -48,6 +49,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict((key, value)
         ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
         CAMEMBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
         T5_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP,
         ]
     for key, value, in pretrained_map.items())
 
@@ -66,6 +68,7 @@ class AutoConfig(object):
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
             - contains `albert`: AlbertConfig (ALBERT model)
             - contains `camembert`: CamembertConfig (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
             - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `bert`: BertConfig (Bert model)
             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
@@ -91,6 +94,7 @@ class AutoConfig(object):
             - contains `distilbert`: DistilBertConfig (DistilBERT model)
             - contains `albert`: AlbertConfig (ALBERT model)
             - contains `camembert`: CamembertConfig (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaConfig (XLM-RoBERTa model)
             - contains `roberta`: RobertaConfig (RoBERTa model)
             - contains `bert`: BertConfig (Bert model)
             - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
@@ -152,6 +156,8 @@ class AutoConfig(object):
             return AlbertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'camembert' in pretrained_model_name_or_path:
             return CamembertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -170,4 +176,4 @@ class AutoConfig(object):
             return CTRLConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
+                         "'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))

From 036831e2791b84edb5a89db7a92af7c69f4ff37e Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 18:23:42 +0100
Subject: [PATCH 14/26] auto: add XLM-RoBERTa to audo modeling

---
 transformers/modeling_auto.py | 20 +++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/transformers/modeling_auto.py b/transformers/modeling_auto.py
index 1a30ea4623..ca6c4525b8 100644
--- a/transformers/modeling_auto.py
+++ b/transformers/modeling_auto.py
@@ -30,6 +30,7 @@ from .modeling_distilbert import DistilBertModel, DistilBertForQuestionAnswering
 from .modeling_camembert import CamembertModel, CamembertForMaskedLM, CamembertForSequenceClassification, CamembertForMultipleChoice, CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_albert import AlbertModel, AlbertForMaskedLM, AlbertForSequenceClassification, AlbertForQuestionAnswering, ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP
 from .modeling_t5 import T5Model, T5WithLMHeadModel, T5_PRETRAINED_MODEL_ARCHIVE_MAP
+from .modeling_xlm_roberta import XLMRobertaModel, XLMRobertaForMaskedLM, XLMRobertaForSequenceClassification, XLMRobertaForMultipleChoice, XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_utils import PreTrainedModel, SequenceSummary
 
@@ -52,6 +53,7 @@ ALL_PRETRAINED_MODEL_ARCHIVE_MAP = dict((key, value)
         ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         CAMEMBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         T5_PRETRAINED_MODEL_ARCHIVE_MAP,
+        XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP,
         ]
     for key, value, in pretrained_map.items())
 
@@ -72,6 +74,7 @@ class AutoModel(object):
             - contains `distilbert`: DistilBertModel (DistilBERT model)
             - contains `albert`: AlbertModel (ALBERT model)
             - contains `camembert`: CamembertModel (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaModel (XLM-RoBERTa model)
             - contains `roberta`: RobertaModel (RoBERTa model)
             - contains `bert`: BertModel (Bert model)
             - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
@@ -98,6 +101,7 @@ class AutoModel(object):
             - contains `distilbert`: DistilBertModel (DistilBERT model)
             - contains `albert`: AlbertModel (ALBERT model)
             - contains `camembert`: CamembertModel (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaModel (XLM-RoBERTa model)
             - contains `roberta`: RobertaModel (RoBERTa model)
             - contains `bert`: BertModel (Bert model)
             - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
@@ -175,6 +179,8 @@ class AutoModel(object):
             return AlbertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'camembert' in pretrained_model_name_or_path:
             return CamembertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -193,7 +199,7 @@ class AutoModel(object):
             return CTRLModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
+                         "'xlm-roberta', 'xlm', 'roberta, 'ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
 
 
 class AutoModelWithLMHead(object):
@@ -212,6 +218,7 @@ class AutoModelWithLMHead(object):
             - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
             - contains `albert`: AlbertForMaskedLM (ALBERT model)
             - contains `camembert`: CamembertForMaskedLM (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaForMaskedLM (XLM-RoBERTa model)
             - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
             - contains `bert`: BertForMaskedLM (Bert model)
             - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
@@ -241,6 +248,7 @@ class AutoModelWithLMHead(object):
             - contains `distilbert`: DistilBertForMaskedLM (DistilBERT model)
             - contains `albert`: AlbertForMaskedLM (ALBERT model)
             - contains `camembert`: CamembertForMaskedLM (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaForMaskedLM (XLM-RoBERTa model)
             - contains `roberta`: RobertaForMaskedLM (RoBERTa model)
             - contains `bert`: BertForMaskedLM (Bert model)
             - contains `openai-gpt`: OpenAIGPTLMHeadModel (OpenAI GPT model)
@@ -317,6 +325,8 @@ class AutoModelWithLMHead(object):
             return AlbertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'camembert' in pretrained_model_name_or_path:
             return CamembertForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaForMaskedLM.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -335,7 +345,7 @@ class AutoModelWithLMHead(object):
             return CTRLLMHeadModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
+                         "'xlm-roberta', 'xlm', 'roberta','ctrl', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
 
 
 class AutoModelForSequenceClassification(object):
@@ -353,6 +363,7 @@ class AutoModelForSequenceClassification(object):
             - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
             - contains `albert`: AlbertForSequenceClassification (ALBERT model)
             - contains `camembert`: CamembertForSequenceClassification (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaForSequenceClassification (XLM-RoBERTa model)
             - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
             - contains `bert`: BertForSequenceClassification (Bert model)
             - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
@@ -377,6 +388,7 @@ class AutoModelForSequenceClassification(object):
             - contains `distilbert`: DistilBertForSequenceClassification (DistilBERT model)
             - contains `albert`: AlbertForSequenceClassification (ALBERT model)
             - contains `camembert`: CamembertForSequenceClassification (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaForSequenceClassification (XLM-RoBERTa model)
             - contains `roberta`: RobertaForSequenceClassification (RoBERTa model)
             - contains `bert`: BertForSequenceClassification (Bert model)
             - contains `xlnet`: XLNetForSequenceClassification (XLNet model)
@@ -448,6 +460,8 @@ class AutoModelForSequenceClassification(object):
             return AlbertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'camembert' in pretrained_model_name_or_path:
             return CamembertForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'bert' in pretrained_model_name_or_path:
@@ -458,7 +472,7 @@ class AutoModelForSequenceClassification(object):
             return XLMForSequenceClassification.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
-                         "'bert', 'xlnet', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
+                         "'bert', 'xlnet', 'xlm-roberta', 'xlm', 'roberta', 'distilbert', 'camembert', 'albert'".format(pretrained_model_name_or_path))
 
 
 class AutoModelForQuestionAnswering(object):

From 64a971a9156788ed6d95f850453578ecb74069c5 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 18:24:32 +0100
Subject: [PATCH 15/26] auto: add XLM-RoBERTa to auto tokenization

---
 transformers/tokenization_auto.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/transformers/tokenization_auto.py b/transformers/tokenization_auto.py
index 173dee0e2b..5377bd48cb 100644
--- a/transformers/tokenization_auto.py
+++ b/transformers/tokenization_auto.py
@@ -31,6 +31,7 @@ from .tokenization_distilbert import DistilBertTokenizer
 from .tokenization_camembert import CamembertTokenizer
 from .tokenization_albert import AlbertTokenizer
 from .tokenization_t5 import T5Tokenizer
+from .tokenization_xlm_roberta import XLMRobertaTokenizer
 
 logger = logging.getLogger(__name__)
 
@@ -49,6 +50,7 @@ class AutoTokenizer(object):
             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
             - contains `albert`: AlbertTokenizer (ALBERT model)
             - contains `camembert`: CamembertTokenizer (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
             - contains `roberta`: RobertaTokenizer (RoBERTa model)
             - contains `bert`: BertTokenizer (Bert model)
             - contains `openai-gpt`: OpenAIGPTTokenizer (OpenAI GPT model)
@@ -75,6 +77,7 @@ class AutoTokenizer(object):
             - contains `distilbert`: DistilBertTokenizer (DistilBert model)
             - contains `albert`: AlbertTokenizer (ALBERT model)
             - contains `camembert`: CamembertTokenizer (CamemBERT model)
+            - contains `xlm-roberta`: XLMRobertaTokenizer (XLM-RoBERTa model)
             - contains `roberta`: RobertaTokenizer (RoBERTa model)
             - contains `bert-base-japanese`: BertJapaneseTokenizer (Bert model)
             - contains `bert`: BertTokenizer (Bert model)
@@ -130,6 +133,8 @@ class AutoTokenizer(object):
             return AlbertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'camembert' in pretrained_model_name_or_path:
             return CamembertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'xlm-roberta' in pretrained_model_name_or_path:
+            return XLMRobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'roberta' in pretrained_model_name_or_path:
             return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'bert-base-japanese' in pretrained_model_name_or_path:
@@ -150,4 +155,4 @@ class AutoTokenizer(object):
             return CTRLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))
+                         "'xlm-roberta', 'xlm', 'roberta', 'distilbert,' 'camembert', 'ctrl', 'albert'".format(pretrained_model_name_or_path))

From e778dd854dd5d1fd29396d214577ddbe0f854247 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 19:27:34 +0100
Subject: [PATCH 16/26] modeling: add XLM-RoBERTa base model

---
 transformers/modeling_xlm_roberta.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py
index 4c833c69ff..abace25d5b 100644
--- a/transformers/modeling_xlm_roberta.py
+++ b/transformers/modeling_xlm_roberta.py
@@ -27,6 +27,7 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'xlm-roberta-base': "https://schweter.eu/cloud/transformers/xlm-roberta-base-pytorch_model.bin",
     'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-pytorch_model.bin",
 }
 

From 128cfdee9bdd13f2f0bbef977907ff014ae398ad Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 19:28:16 +0100
Subject: [PATCH 17/26] tokenization add XLM-RoBERTa base model

---
 transformers/tokenization_xlm_roberta.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py
index d8484e7f9c..93b1c397e4 100644
--- a/transformers/tokenization_xlm_roberta.py
+++ b/transformers/tokenization_xlm_roberta.py
@@ -30,11 +30,13 @@ VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
 PRETRAINED_VOCAB_FILES_MAP = {
     'vocab_file':
     {
+    'xlm-roberta-base': "https://schweter.eu/cloud/transformers/xlm-roberta-base-sentencepiece.bpe.model",
     'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-sentencepiece.bpe.model",
     }
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'xlm-roberta-base': None,
     'xlm-roberta-large': None,
 }
 

From 3e89fca54359bff728782cce4157118caed33c09 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 19:44:23 +0100
Subject: [PATCH 18/26] readme: add XLM-RoBERTa to model architecture list

---
 README.md | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index c33a65bdbb..a9d0fb3ace 100644
--- a/README.md
+++ b/README.md
@@ -146,7 +146,8 @@ At some point in the future, you'll be able to seamlessly move from pre-training
 10. **[CamemBERT](https://camembert-model.fr)** (from Inria/Facebook/Sorbonne) released with the paper [CamemBERT: a Tasty French Language Model](https://arxiv.org/abs/1911.03894) by Louis Martin*, Benjamin Muller*, Pedro Javier Ortiz Suárez*, Yoann Dupont, Laurent Romary, Éric Villemonte de la Clergerie, Djamé Seddah and Benoît Sagot.
 11. **[ALBERT](https://github.com/google-research/ALBERT)** (from Google Research and the Toyota Technological Institute at Chicago) released with the paper [ALBERT: A Lite BERT for Self-supervised Learning of Language Representations](https://arxiv.org/abs/1909.11942), by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
 12. **[T5](https://github.com/google-research/text-to-text-transfer-transformer)** (from Google AI) released with the paper [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://arxiv.org/abs/1910.10683) by Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu.
-13. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
+13. **[XLM-RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/xlmr)** (from Facebook AI), released together with the paper [Unsupervised Cross-lingual Representation Learning at Scale](https://arxiv.org/abs/1911.02116) by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+14. Want to contribute a new model? We have added a **detailed guide and templates** to guide you in the process of adding a new model. You can find them in the [`templates`](./templates) folder of the repository. Be sure to check the [contributing guidelines](./CONTRIBUTING.md) and contact the maintainers or open an issue to collect feedbacks before starting your PR.
 
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/transformers/examples.html).
 
@@ -168,7 +169,7 @@ import torch
 from transformers import *
 
 # Transformers has a unified API
-# for 8 transformer architectures and 30 pretrained weights.
+# for 10 transformer architectures and 30 pretrained weights.
 #          Model          | Tokenizer          | Pretrained weights shortcut
 MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
           (OpenAIGPTModel,  OpenAIGPTTokenizer,  'openai-gpt'),
@@ -178,7 +179,9 @@ MODELS = [(BertModel,       BertTokenizer,       'bert-base-uncased'),
           (XLNetModel,      XLNetTokenizer,      'xlnet-base-cased'),
           (XLMModel,        XLMTokenizer,        'xlm-mlm-enfr-1024'),
           (DistilBertModel, DistilBertTokenizer, 'distilbert-base-uncased'),
-          (RobertaModel,    RobertaTokenizer,    'roberta-base')]
+          (RobertaModel,    RobertaTokenizer,    'roberta-base'),
+          (XLMRobertaModel, XLMRobertaTokenizer, 'xlm-roberta-base'),
+         ]
 
 # To use TensorFlow 2.0 versions of the models, simply prefix the class names with 'TF', e.g. `TFRobertaModel` is the TF 2.0 counterpart of the PyTorch model `RobertaModel`
 

From d35405b7a32eadf4fb1200249b2bbc4c12fb0340 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 19:45:10 +0100
Subject: [PATCH 19/26] docs: add XLM-RoBERTa to index page

---
 docs/source/index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 48282c1c6c..cb34c5c7f0 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -50,6 +50,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
 9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
 11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
+13. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 
 .. toctree::
     :maxdepth: 2

From dd7a958fd6963d09850ad4842307d1d1064d096d Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 19:45:46 +0100
Subject: [PATCH 20/26] docs: add XLM-RoBERTa to pretrained model list (incl.
 all parameters)

---
 docs/source/pretrained_models.rst | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index 7d037da34f..a359990f5a 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -240,6 +240,12 @@ Here is the full list of the currently provided pretrained models together with
 |                   | ``t5-11B``                                                 | | ~11B parameters with 24-layers, 1024-hidden-state, 65536 feed-forward hidden-state, 128-heads,                                      |
 |                   |                                                            | | Trained on English text: the Colossal Clean Crawled Corpus (C4)                                                                     |
 +-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| XLM-RoBERTa       | ``xlm-roberta-base``                                       | | ~125M parameters with 12-layers, 768-hidden-state, 3072 feed-forward hidden-state, 8-heads,                                         |
+|                   |                                                            | | Trained on on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                       |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-roberta-large``                                      | | ~355M parameters with 24-layers, 1027-hidden-state, 4096 feed-forward hidden-state, 16-heads,                                       |
+|                   |                                                            | | Trained on 2.5 TB of newly created clean CommonCrawl data in 100 languages                                                          |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 
 
 .. <https://huggingface.co/transformers/examples.html>`__

From f09d9996413f2b265f1c672d7a4b438e4c5099c4 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 19:49:33 +0100
Subject: [PATCH 21/26] =?UTF-8?q?docs:=20fix=20numbering=20=F0=9F=98=85?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 docs/source/index.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/source/index.rst b/docs/source/index.rst
index cb34c5c7f0..0ac9c740a5 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -50,7 +50,7 @@ The library currently contains PyTorch and Tensorflow implementations, pre-train
 9. `CTRL <https://github.com/pytorch/fairseq/tree/master/examples/ctrl>`_ (from Salesforce), released together with the paper `CTRL: A Conditional Transformer Language Model for Controllable Generation <https://www.github.com/salesforce/ctrl>`_ by Nitish Shirish Keskar*, Bryan McCann*, Lav R. Varshney, Caiming Xiong and Richard Socher.
 10. `CamemBERT <https://huggingface.co/transformers/model_doc/camembert.html>`_ (from FAIR, Inria, Sorbonne Université) released together with the paper `CamemBERT: a Tasty French Language Model <https://arxiv.org/abs/1911.03894>`_ by Louis Martin, Benjamin Muller, Pedro Javier Ortiz Suarez, Yoann Dupont, Laurent Romary, Eric Villemonte de la Clergerie, Djame Seddah, and Benoît Sagot.
 11. `ALBERT <https://github.com/google-research/ALBERT>`_ (from Google Research), released together with the paper a `ALBERT: A Lite BERT for Self-supervised Learning of Language Representations <https://arxiv.org/abs/1909.11942>`_ by Zhenzhong Lan, Mingda Chen, Sebastian Goodman, Kevin Gimpel, Piyush Sharma, Radu Soricut.
-13. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
+12. `XLM-RoBERTa <https://github.com/pytorch/fairseq/tree/master/examples/xlmr>`_ (from Facebook AI), released together with the paper `Unsupervised Cross-lingual Representation Learning at Scale <https://arxiv.org/abs/1911.02116>`_ by Alexis Conneau*, Kartikay Khandelwal*, Naman Goyal, Vishrav Chaudhary, Guillaume Wenzek, Francisco Guzmán, Edouard Grave, Myle Ott, Luke Zettlemoyer and Veselin Stoyanov.
 
 .. toctree::
     :maxdepth: 2

From db90e1211433ef99f952e60fbe5ea578391b86b1 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 23:46:33 +0100
Subject: [PATCH 22/26] configuration: use S3 location for XLM-RoBERTa model

---
 transformers/configuration_xlm_roberta.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py
index dd03572976..d7a26538c5 100644
--- a/transformers/configuration_xlm_roberta.py
+++ b/transformers/configuration_xlm_roberta.py
@@ -25,7 +25,8 @@ from .configuration_roberta import RobertaConfig
 logger = logging.getLogger(__name__)
 
 XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
-    'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-config.json",
+    'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json",
+    'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json",
 }
 
 

From 5c5f67a256b558b470f03cf36edc5ea35dec2fba Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 23:47:00 +0100
Subject: [PATCH 23/26] modeling: use S3 location for XLM-RoBERTa model

---
 transformers/modeling_xlm_roberta.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py
index abace25d5b..8095c46a16 100644
--- a/transformers/modeling_xlm_roberta.py
+++ b/transformers/modeling_xlm_roberta.py
@@ -27,8 +27,8 @@ from .file_utils import add_start_docstrings
 logger = logging.getLogger(__name__)
 
 XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
-    'xlm-roberta-base': "https://schweter.eu/cloud/transformers/xlm-roberta-base-pytorch_model.bin",
-    'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-pytorch_model.bin",
+    'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin",
+    'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin",
 }
 
 

From fe9aab1055604e772be05a1cbbb36a207c177055 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Wed, 18 Dec 2019 23:47:48 +0100
Subject: [PATCH 24/26] tokenization: use S3 location for XLM-RoBERTa model

---
 transformers/tokenization_xlm_roberta.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py
index 93b1c397e4..453c4375c6 100644
--- a/transformers/tokenization_xlm_roberta.py
+++ b/transformers/tokenization_xlm_roberta.py
@@ -30,8 +30,8 @@ VOCAB_FILES_NAMES = {'vocab_file': 'sentencepiece.bpe.model'}
 PRETRAINED_VOCAB_FILES_MAP = {
     'vocab_file':
     {
-    'xlm-roberta-base': "https://schweter.eu/cloud/transformers/xlm-roberta-base-sentencepiece.bpe.model",
-    'xlm-roberta-large': "https://schweter.eu/cloud/transformers/xlm-roberta-large-sentencepiece.bpe.model",
+    'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model",
+    'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model",
     }
 }
 

From a26ce4dee116a1d5d9099c8a94e22d1e31ad631c Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Thu, 19 Dec 2019 02:23:01 +0100
Subject: [PATCH 25/26] examples: add XLM-RoBERTa to glue script

---
 examples/run_glue.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 1a51255c11..954a8fbf0c 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -52,6 +52,9 @@ from transformers import (WEIGHTS_NAME, BertConfig,
                                   AlbertConfig,
                                   AlbertForSequenceClassification, 
                                   AlbertTokenizer,
+                                  XLMRobertaConfig,
+                                  XLMRobertaForSequenceClassification,
+                                  XLMRobertaTokenizer,
                                 )
 
 from transformers import AdamW, get_linear_schedule_with_warmup
@@ -72,7 +75,8 @@ MODEL_CLASSES = {
     'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
     'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
     'distilbert': (DistilBertConfig, DistilBertForSequenceClassification, DistilBertTokenizer),
-    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer)
+    'albert': (AlbertConfig, AlbertForSequenceClassification, AlbertTokenizer),
+    'xlmroberta': (XLMRobertaConfig, XLMRobertaForSequenceClassification, XLMRobertaTokenizer),
 }
 
 
@@ -304,9 +308,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     else:
         logger.info("Creating features from dataset file at %s", args.data_dir)
         label_list = processor.get_labels()
-        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
+        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta', 'xlmroberta']:
             # HACK(label indices are swapped in RoBERTa pretrained model)
-            label_list[1], label_list[2] = label_list[2], label_list[1] 
+            label_list[1], label_list[2] = label_list[2], label_list[1]
         examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         features = convert_examples_to_features(examples,
                                                 tokenizer,

From 3376adc05157ba826acafd49f07f9a01ae30eb07 Mon Sep 17 00:00:00 2001
From: Stefan Schweter <stefan@schweter.it>
Date: Thu, 19 Dec 2019 21:30:23 +0100
Subject: [PATCH 26/26] configuration/modeling/tokenization: add various
 fine-tuned XLM-RoBERTa models for English, German, Spanish and Dutch (CoNLL
 datasets)

---
 transformers/configuration_xlm_roberta.py | 4 ++++
 transformers/modeling_xlm_roberta.py      | 4 ++++
 transformers/tokenization_xlm_roberta.py  | 4 ++++
 3 files changed, 12 insertions(+)

diff --git a/transformers/configuration_xlm_roberta.py b/transformers/configuration_xlm_roberta.py
index d7a26538c5..5b6955f4f8 100644
--- a/transformers/configuration_xlm_roberta.py
+++ b/transformers/configuration_xlm_roberta.py
@@ -27,6 +27,10 @@ logger = logging.getLogger(__name__)
 XLM_ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
     'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-config.json",
     'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-config.json",
+    'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-config.json",
+    'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-config.json",
+    'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-config.json",
+    'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-config.json",
 }
 
 
diff --git a/transformers/modeling_xlm_roberta.py b/transformers/modeling_xlm_roberta.py
index 8095c46a16..0bdce941a5 100644
--- a/transformers/modeling_xlm_roberta.py
+++ b/transformers/modeling_xlm_roberta.py
@@ -29,6 +29,10 @@ logger = logging.getLogger(__name__)
 XLM_ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
     'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-pytorch_model.bin",
     'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-pytorch_model.bin",
+    'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-pytorch_model.bin",
+    'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-pytorch_model.bin",
+    'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-pytorch_model.bin",
+    'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-pytorch_model.bin",
 }
 
 
diff --git a/transformers/tokenization_xlm_roberta.py b/transformers/tokenization_xlm_roberta.py
index 453c4375c6..4397e7b031 100644
--- a/transformers/tokenization_xlm_roberta.py
+++ b/transformers/tokenization_xlm_roberta.py
@@ -32,6 +32,10 @@ PRETRAINED_VOCAB_FILES_MAP = {
     {
     'xlm-roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-base-sentencepiece.bpe.model",
     'xlm-roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-sentencepiece.bpe.model",
+    'xlm-roberta-large-finetuned-conll02-dutch': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-dutch-sentencepiece.bpe.model",
+    'xlm-roberta-large-finetuned-conll02-spanish': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll02-spanish-sentencepiece.bpe.model",
+    'xlm-roberta-large-finetuned-conll03-english': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-english-sentencepiece.bpe.model",
+    'xlm-roberta-large-finetuned-conll03-german': "https://s3.amazonaws.com/models.huggingface.co/bert/xlm-roberta-large-finetuned-conll03-german-sentencepiece.bpe.model",
     }
 }