From c4e9615691a19128f446563718355aedf03cf01b Mon Sep 17 00:00:00 2001
From: Wei-Sheng Chin <wechi@microsoft.com>
Date: Wed, 17 Jul 2019 09:08:40 -0700
Subject: [PATCH 01/45] Fix a path so that test can run on Windows

---
 pytorch_transformers/tests/modeling_common_test.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pytorch_transformers/tests/modeling_common_test.py b/pytorch_transformers/tests/modeling_common_test.py
index 5ea98d68e2..e974ae865d 100644
--- a/pytorch_transformers/tests/modeling_common_test.py
+++ b/pytorch_transformers/tests/modeling_common_test.py
@@ -21,6 +21,7 @@ import os
 import shutil
 import json
 import random
+import uuid
 
 import unittest
 import logging
@@ -527,7 +528,7 @@ class ConfigTester(object):
 
     def create_and_test_config_to_json_file(self):
         config_first = self.config_class(**self.inputs_dict)
-        json_file_path = "/tmp/config.json"
+        json_file_path = os.path.join(os.getcwd(), "config_" + str(uuid.uuid4()) + ".json")
         config_first.to_json_file(json_file_path)
         config_second = self.config_class.from_json_file(json_file_path)
         os.remove(json_file_path)

From 05c083520ab58ae8a73d853d0e366e56d5690da4 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Sun, 4 Aug 2019 21:39:21 -0400
Subject: [PATCH 02/45] =?UTF-8?q?[RoBERTa]=20model=20conversion,=20inferen?=
 =?UTF-8?q?ce,=20tests=20=F0=9F=94=A5?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md                                     |   1 +
 .../convert_roberta_checkpoint_to_pytorch.py  | 164 +++++++++++++
 pytorch_transformers/modeling_roberta.py      | 128 ++++++++++
 .../tests/modeling_roberta_test.py            |  69 ++++++
 .../tests/tokenization_roberta_test.py        |  42 ++++
 pytorch_transformers/tokenization_roberta.py  | 218 ++++++++++++++++++
 6 files changed, 622 insertions(+)
 create mode 100644 pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
 create mode 100644 pytorch_transformers/modeling_roberta.py
 create mode 100644 pytorch_transformers/tests/modeling_roberta_test.py
 create mode 100644 pytorch_transformers/tests/tokenization_roberta_test.py
 create mode 100644 pytorch_transformers/tokenization_roberta.py

diff --git a/README.md b/README.md
index 703eb47df9..1e2b025eed 100644
--- a/README.md
+++ b/README.md
@@ -12,6 +12,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
 4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
+7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott et al.
 
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/pytorch-transformers/examples.html).
 
diff --git a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
new file mode 100644
index 0000000000..7a17ee3f1b
--- /dev/null
+++ b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert RoBERTa checkpoint."""
+
+from __future__ import absolute_import, division, print_function
+
+import argparse
+import logging
+import numpy as np
+import torch
+
+from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
+from fairseq.modules import TransformerSentenceEncoderLayer
+from pytorch_transformers.modeling_bert import (BertConfig, BertEncoder,
+                                                BertIntermediate, BertLayer,
+                                                BertModel, BertOutput,
+                                                BertSelfAttention,
+                                                BertSelfOutput)
+from pytorch_transformers.modeling_roberta import (RobertaEmbeddings,
+                                                   RobertaForMaskedLM,
+                                                   RobertaModel)
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+SAMPLE_TEXT = 'Hello world! cécé herlolip'
+
+
+def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path):
+    """
+    Copy/paste/tweak roberta's weights to our BERT structure.
+
+    Loads the fairseq checkpoint at `roberta_checkpoint_path`, copies its weights into a
+    `RobertaForMaskedLM`, checks that both models produce the same output on SAMPLE_TEXT
+    (raising an Exception otherwise) and saves the result to `pytorch_dump_folder_path`.
+    """
+    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
+    roberta.eval()  # disable dropout
+    roberta_sent_encoder = roberta.model.decoder.sentence_encoder
+    config = BertConfig(
+        # Embedding sizes come from the checkpoint so the saved config matches the copied weights.
+        vocab_size_or_config_json_file=roberta_sent_encoder.embed_tokens.weight.shape[0],
+        hidden_size=roberta.args.encoder_embed_dim,
+        num_hidden_layers=roberta.args.encoder_layers,
+        num_attention_heads=roberta.args.encoder_attention_heads,
+        intermediate_size=roberta.args.encoder_ffn_embed_dim,
+        max_position_embeddings=roberta_sent_encoder.embed_positions.weight.shape[0],
+        type_vocab_size=1,
+    )
+    print("Our BERT config:", config)
+
+    model = RobertaForMaskedLM(config)
+    model.eval()
+
+    # Now let's copy all the weights.
+    # Embeddings
+    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
+    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
+    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(model.roberta.embeddings.token_type_embeddings.weight)  # just zero them out b/c RoBERTa doesn't use them.
+    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
+    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
+    model.roberta.embeddings.LayerNorm.variance_epsilon = roberta_sent_encoder.emb_layer_norm.eps
+
+    hidden = config.hidden_size
+    for i in range(config.num_hidden_layers):
+        # Encoder: start of layer
+        layer: BertLayer = model.roberta.encoder.layer[i]
+        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]
+
+        ### self attention
+        self_attn: BertSelfAttention = layer.attention.self
+        in_proj_weight = roberta_layer.self_attn.in_proj_weight
+        in_proj_bias = roberta_layer.self_attn.in_proj_bias
+        assert in_proj_weight.shape == torch.Size((3 * hidden, hidden))
+        # we use three distinct linear layers so we split the source layer here.
+        self_attn.query.weight.data = in_proj_weight[:hidden, :]
+        self_attn.query.bias.data = in_proj_bias[:hidden]
+        self_attn.key.weight.data = in_proj_weight[hidden:2 * hidden, :]
+        self_attn.key.bias.data = in_proj_bias[hidden:2 * hidden]
+        self_attn.value.weight.data = in_proj_weight[2 * hidden:, :]
+        self_attn.value.bias.data = in_proj_bias[2 * hidden:]
+
+        ### self-attention output
+        self_output: BertSelfOutput = layer.attention.output
+        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
+        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
+        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
+        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
+        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
+        self_output.LayerNorm.variance_epsilon = roberta_layer.self_attn_layer_norm.eps
+
+        ### intermediate
+        intermediate: BertIntermediate = layer.intermediate
+        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
+        intermediate.dense.weight = roberta_layer.fc1.weight
+        intermediate.dense.bias = roberta_layer.fc1.bias
+
+        ### output
+        bert_output: BertOutput = layer.output
+        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
+        bert_output.dense.weight = roberta_layer.fc2.weight
+        bert_output.dense.bias = roberta_layer.fc2.bias
+        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
+        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
+        bert_output.LayerNorm.variance_epsilon = roberta_layer.final_layer_norm.eps
+        #### end of layer
+
+    # LM Head
+    lm_head = roberta.model.decoder.lm_head
+    model.lm_head.dense.weight = lm_head.dense.weight
+    model.lm_head.dense.bias = lm_head.dense.bias
+    model.lm_head.layer_norm.weight = lm_head.layer_norm.weight
+    model.lm_head.layer_norm.bias = lm_head.layer_norm.bias
+    model.lm_head.layer_norm.variance_epsilon = lm_head.layer_norm.eps
+    model.lm_head.weight = lm_head.weight
+    model.lm_head.bias = lm_head.bias
+
+    # Let's check that we get the same results.
+    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1
+
+    with torch.no_grad():  # inference only: no need to build autograd graphs
+        our_output = model(input_ids)[0]
+        their_output = roberta.model(input_ids)[0]
+    print(our_output.shape, their_output.shape)
+    success = torch.allclose(our_output, their_output, atol=1e-3)
+    print("Do both models output the same tensors?", "🔥" if success else "💩")
+    if not success:
+        max_diff = (our_output - their_output).abs().max().item()
+        raise Exception("Something went wRoNg (max absolute difference: {})".format(max_diff))
+
+    print(f"Saving model to {pytorch_dump_folder_path}")
+    model.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    ## Required parameters
+    # argparse exits with a usage error when either path is missing, so no default is needed.
+    parser.add_argument(
+        "--roberta_checkpoint_path",
+        type=str,
+        required=True,
+        help="Path to the official PyTorch dump.",
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path",
+        type=str,
+        required=True,
+        help="Path to the output PyTorch model.",
+    )
+    args = parser.parse_args()
+    convert_roberta_checkpoint_to_pytorch(args.roberta_checkpoint_path, args.pytorch_dump_folder_path)
diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py
new file mode 100644
index 0000000000..b92ffd0433
--- /dev/null
+++ b/pytorch_transformers/modeling_roberta.py
@@ -0,0 +1,128 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch RoBERTa model. """
+
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import logging
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from pytorch_transformers.modeling_bert import (BertConfig, BertEmbeddings,
+                                                BertLayerNorm, BertModel,
+                                                BertPreTrainedModel, gelu)
+
+logger = logging.getLogger(__name__)
+
+ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
+    'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-pytorch_model.bin",
+    'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-pytorch_model.bin",
+    'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-pytorch_model.bin",
+}
+
+ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-config.json",
+    'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-config.json",
+    'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-config.json",
+}
+
+
+class RobertaEmbeddings(BertEmbeddings):
+    """
+    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
+    """
+    def __init__(self, config):
+        super(RobertaEmbeddings, self).__init__(config)
+        self.padding_idx = 1
+
+    def forward(self, input_ids, token_type_ids=None, position_ids=None):
+        seq_length = input_ids.size(1)
+        if position_ids is None:
+            # Position numbers begin at padding_idx+1. Padding symbols are ignored.
+            # cf. fairseq's `utils.make_positions`
+            position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
+            position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
+        return super().forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
+
+
+class RobertaConfig(BertConfig):
+    pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
+
+class RobertaModel(BertModel):
+    """
+    Same as BertModel with:
+    - a tiny embeddings tweak.
+    - setup for Roberta pretrained models
+    """
+    config_class = RobertaConfig
+    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "roberta"
+
+    def __init__(self, config):
+        super(RobertaModel, self).__init__(config)
+
+        self.embeddings = RobertaEmbeddings(config)
+
+
+
+class RobertaForMaskedLM(BertPreTrainedModel):
+    """
+    Roberta Model with a `language modeling` head on top.
+    """
+    config_class = RobertaConfig
+    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "roberta"
+
+    def __init__(self, config):
+        super(RobertaForMaskedLM, self).__init__(config)
+
+        self.roberta = RobertaModel(config)
+        self.lm_head = RobertaLMHead(config)
+    
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
+        outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                            attention_mask=attention_mask, head_mask=head_mask)
+        sequence_output = outputs[0]
+        prediction_scores = self.lm_head(sequence_output)
+
+        outputs = (prediction_scores,) + outputs[2:]
+        return outputs
+
+
+
+class RobertaLMHead(nn.Module):
+    """Roberta Head for masked language modeling."""
+
+    def __init__(self, config: BertConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.weight = nn.Linear(config.hidden_size, config.vocab_size, bias=False).weight
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+    def forward(self, features, **kwargs):
+        x = self.dense(features)
+        x = gelu(x)
+        x = self.layer_norm(x)
+
+        # project back to size of vocabulary with bias
+        x = F.linear(x, self.weight) + self.bias
+
+        return x
diff --git a/pytorch_transformers/tests/modeling_roberta_test.py b/pytorch_transformers/tests/modeling_roberta_test.py
new file mode 100644
index 0000000000..62707326a6
--- /dev/null
+++ b/pytorch_transformers/tests/modeling_roberta_test.py
@@ -0,0 +1,69 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import os
+import unittest
+import pytest
+import torch
+
+from pytorch_transformers.modeling_roberta import (RobertaForMaskedLM,
+                                                   RobertaModel)
+
+
+class RobertaModelTest(unittest.TestCase):
+
+    # @pytest.mark.slow
+    def test_inference_masked_lm(self):
+        model = RobertaForMaskedLM.from_pretrained('roberta-base')
+        
+        input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
+        output = model(input_ids)[0]
+        expected_shape = torch.Size((1, 11, 50265))
+        self.assertEqual(
+            output.shape,
+            expected_shape
+        )
+        # compare the actual values for a slice.
+        expected_slice = torch.Tensor(
+            [[[33.8843, -4.3107, 22.7779],
+              [ 4.6533, -2.8099, 13.6252],
+              [ 1.8222, -3.6898,  8.8600]]]
+        )
+        self.assertTrue(
+            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
+        )
+
+    # @pytest.mark.slow
+    def test_inference_no_head(self):
+        model = RobertaModel.from_pretrained('roberta-base')
+        
+        input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
+        output = model(input_ids)[0]
+        # compare the actual values for a slice.
+        expected_slice = torch.Tensor(
+            [[[-0.0231,  0.0782,  0.0074],
+              [-0.1854,  0.0539, -0.0174],
+              [ 0.0548,  0.0799,  0.1687]]]
+        )
+        self.assertTrue(
+            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
+        )
+
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py
new file mode 100644
index 0000000000..01268f7d25
--- /dev/null
+++ b/pytorch_transformers/tests/tokenization_roberta_test.py
@@ -0,0 +1,42 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import os
+import unittest
+import pytest
+
+from pytorch_transformers.tokenization_roberta import RobertaTokenizer
+
+
+class RobertaTokenizationTest(unittest.TestCase):
+
+    # @pytest.mark.slow
+    def test_full_tokenizer(self):
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        self.assertListEqual(
+            tokenizer.encode('Hello world!'),
+            [0, 31414, 232, 328, 2]
+        )
+        self.assertListEqual(
+            tokenizer.encode('Hello world! cécé herlolip'),
+            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
+        )
+
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
new file mode 100644
index 0000000000..92717c6dd1
--- /dev/null
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -0,0 +1,218 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for RoBERTa."""
+from __future__ import (absolute_import, division, print_function,
+                        unicode_literals)
+
+import json
+import logging
+import re
+
+from .tokenization_utils import PreTrainedTokenizer
+from .tokenization_gpt2 import GPT2Tokenizer
+
+logger = logging.getLogger(__name__)
+
+VOCAB_FILES_NAMES = {
+    'dict_file': 'dict.txt',
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    'dict_file':
+    {
+        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
+        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
+        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    'roberta-base': 512,
+    'roberta-large': 512,
+    'roberta-large-mnli': 512,
+}
+
+
+SPACE_NORMALIZER = re.compile(r"\s+")
+
+def tokenize_line(line):
+    line = SPACE_NORMALIZER.sub(" ", line)
+    line = line.strip()
+    return line.split()
+
+
+class Dictionary(object):
+    """
+    A mapping from symbols to consecutive integers
+
+    From Facebook's fairseq.
+    """
+
+    def __init__(
+        self,
+        pad='<pad>',
+        eos='</s>',
+        unk='<unk>',
+        bos='<s>',
+        extra_special_symbols=None,
+    ):
+        self.unk_word, self.pad_word, self.eos_word = unk, pad, eos
+        self.symbols = []
+        self.count = []
+        self.indices = {}
+        self.bos_index = self.add_symbol(bos)
+        self.pad_index = self.add_symbol(pad)
+        self.eos_index = self.add_symbol(eos)
+        self.unk_index = self.add_symbol(unk)
+        if extra_special_symbols:
+            for s in extra_special_symbols:
+                self.add_symbol(s)
+        self.nspecial = len(self.symbols)
+
+    def __getitem__(self, idx):
+        if idx < len(self.symbols):
+            return self.symbols[idx]
+        return self.unk_word
+
+    def index(self, sym):
+        """Returns the index of the specified symbol"""
+        assert isinstance(sym, str)
+        if sym in self.indices:
+            return self.indices[sym]
+        return self.unk_index
+
+    def add_symbol(self, word, n=1):
+        """Adds a word to the dictionary"""
+        if word in self.indices:
+            idx = self.indices[word]
+            self.count[idx] = self.count[idx] + n
+            return idx
+        else:
+            idx = len(self.symbols)
+            self.indices[word] = idx
+            self.symbols.append(word)
+            self.count.append(n)
+            return idx
+
+    @classmethod
+    def load(cls, f, ignore_utf_errors=False):
+        """Loads the dictionary from a text file with the format:
+
+        ```
+        <symbol0> <count0>
+        <symbol1> <count1>
+        ...
+        ```
+        """
+        d = cls()
+        d.add_from_file(f, ignore_utf_errors)
+        return d
+
+    def add_from_file(self, f, ignore_utf_errors=False):
+        """
+        Loads a pre-existing dictionary from a text file and adds its symbols
+        to this instance.
+        """
+        if isinstance(f, str):
+            try:
+                if not ignore_utf_errors:
+                    with open(f, 'r', encoding='utf-8') as fd:
+                        self.add_from_file(fd)
+                else:
+                    with open(f, 'r', encoding='utf-8', errors='ignore') as fd:
+                        self.add_from_file(fd)
+            except FileNotFoundError as fnfe:
+                raise fnfe
+            except UnicodeError:
+                raise Exception("Incorrect encoding detected in {}, please "
+                                "rebuild the dataset".format(f))
+            return
+
+        lines = f.readlines()
+        for line in lines:
+            idx = line.rfind(' ')
+            if idx == -1:
+                raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
+            word = line[:idx]
+            count = int(line[idx + 1:])
+            self.indices[word] = len(self.symbols)
+            self.symbols.append(word)
+            self.count.append(count)
+    
+    def encode_line(self, line, line_tokenizer=tokenize_line, add_if_not_exist=True,
+                    consumer=None, append_eos=True, reverse_order=False):
+        words = line_tokenizer(line)
+        if reverse_order:
+            words = list(reversed(words))
+        nwords = len(words)
+        ids = [0] * (nwords + 1 if append_eos else nwords)
+
+        for i, word in enumerate(words):
+            if add_if_not_exist:
+                idx = self.add_symbol(word)
+            else:
+                idx = self.index(word)
+            if consumer is not None:
+                consumer(word, idx)
+            ids[i] = idx
+        if append_eos:
+            ids[nwords] = self.eos_index
+        return ids
+
+
+
+
+class RobertaTokenizer(PreTrainedTokenizer):
+    """
+    RoBERTa tokenizer. Peculiarities:
+        - GPT-2 tokenizer with a different integer mapping on top.
+    """
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+
+    def __init__(self, dict_file,
+                 bos_token="<s>", eos_token="</s>", **kwargs):
+        super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, **kwargs)
+
+        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
+        self.dictionary = Dictionary.load(dict_file)
+
+    def _tokenize(self, text):
+        """ Use GPT-2 Tokenizer """
+        return self.gpt2_tokenizer._tokenize(text)
+
+    def encode(self, text):
+        """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
+        """
+        gpt2_tokens_joined = " ".join(
+            str(x) for x in self.gpt2_tokenizer.convert_tokens_to_ids(self.tokenize(text))
+        )
+        bpe_sentence = '<s> ' + gpt2_tokens_joined + ' </s>'
+        return self.dictionary.encode_line(bpe_sentence, append_eos=False)
+
+    def _convert_token_to_id(self, token):
+        return self.dictionary.index(token)
+
+    def _convert_id_to_token(self, index):
+        symbol = self.dictionary[index]
+        try:
+            idx = int(symbol)
+            return self.gpt2_tokenizer._convert_id_to_token(idx)
+        except:
+            return symbol
+
+    def convert_tokens_to_string(self, tokens):
+        return self.gpt2_tokenizer.convert_tokens_to_string(tokens)

From cb9db101c744276a5028f5b8c675c35536f2096f Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Sun, 4 Aug 2019 22:04:15 -0400
Subject: [PATCH 03/45] Python 2 must DIE

---
 pytorch_transformers/modeling_roberta.py               |  6 +++---
 .../tests/tokenization_roberta_test.py                 | 10 ++++++----
 pytorch_transformers/tokenization_roberta.py           |  4 +++-
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py
index b92ffd0433..109a719616 100644
--- a/pytorch_transformers/modeling_roberta.py
+++ b/pytorch_transformers/modeling_roberta.py
@@ -58,7 +58,7 @@ class RobertaEmbeddings(BertEmbeddings):
             # cf. fairseq's `utils.make_positions`
             position_ids = torch.arange(self.padding_idx+1, seq_length+self.padding_idx+1, dtype=torch.long, device=input_ids.device)
             position_ids = position_ids.unsqueeze(0).expand_as(input_ids)
-        return super().forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
+        return super(RobertaEmbeddings, self).forward(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)
 
 
 class RobertaConfig(BertConfig):
@@ -109,8 +109,8 @@ class RobertaForMaskedLM(BertPreTrainedModel):
 class RobertaLMHead(nn.Module):
     """Roberta Head for masked language modeling."""
 
-    def __init__(self, config: BertConfig):
-        super().__init__()
+    def __init__(self, config):
+        super(RobertaLMHead, self).__init__()
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py
index 01268f7d25..cd4e17ec34 100644
--- a/pytorch_transformers/tests/tokenization_roberta_test.py
+++ b/pytorch_transformers/tests/tokenization_roberta_test.py
@@ -18,6 +18,7 @@ from __future__ import (absolute_import, division, print_function,
 import os
 import unittest
 import pytest
+import six
 
 from pytorch_transformers.tokenization_roberta import RobertaTokenizer
 
@@ -31,10 +32,11 @@ class RobertaTokenizationTest(unittest.TestCase):
             tokenizer.encode('Hello world!'),
             [0, 31414, 232, 328, 2]
         )
-        self.assertListEqual(
-            tokenizer.encode('Hello world! cécé herlolip'),
-            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
-        )
+        if six.PY3:
+            self.assertListEqual(
+                tokenizer.encode('Hello world! cécé herlolip'),
+                [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
+            )
 
 
 
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
index 92717c6dd1..4f9a7bc0fa 100644
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -19,6 +19,8 @@ from __future__ import (absolute_import, division, print_function,
 import json
 import logging
 import re
+from io import open
+import six
 
 from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer
@@ -125,7 +127,7 @@ class Dictionary(object):
         Loads a pre-existing dictionary from a text file and adds its symbols
         to this instance.
         """
-        if isinstance(f, str):
+        if isinstance(f, six.string_types):
             try:
                 if not ignore_utf_errors:
                     with open(f, 'r', encoding='utf-8') as fd:

From 770043eea2927eea1664fdd56b3996a8fb41731c Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 7 Aug 2019 12:53:19 -0400
Subject: [PATCH 04/45] Sentence-pair tasks handling. Using common tests on
 RoBERTa. Forced push to fix indentation.

---
 pytorch_transformers/__init__.py              |   3 +
 pytorch_transformers/modeling_roberta.py      |  28 ++-
 .../tests/modeling_roberta_test.py            | 200 ++++++++++++++----
 .../tests/tokenization_roberta_test.py        |  45 ++--
 pytorch_transformers/tokenization_roberta.py  |  90 ++++++--
 5 files changed, 279 insertions(+), 87 deletions(-)

diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index b4b957192c..d1e42b130a 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -5,6 +5,7 @@ from .tokenization_transfo_xl import (TransfoXLTokenizer, TransfoXLCorpus)
 from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
+from .tokenization_roberta import RobertaTokenizer
 from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)
 
 from .modeling_bert import (BertConfig, BertPreTrainedModel, BertModel, BertForPreTraining,
@@ -33,6 +34,8 @@ from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
+from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel,
+                               ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
                           PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
 
diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py
index 109a719616..43f76989f4 100644
--- a/pytorch_transformers/modeling_roberta.py
+++ b/pytorch_transformers/modeling_roberta.py
@@ -23,6 +23,7 @@ import logging
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from torch.nn import CrossEntropyLoss
 
 from pytorch_transformers.modeling_bert import (BertConfig, BertEmbeddings,
                                                 BertLayerNorm, BertModel,
@@ -78,7 +79,7 @@ class RobertaModel(BertModel):
         super(RobertaModel, self).__init__(config)
 
         self.embeddings = RobertaEmbeddings(config)
-
+        self.apply(self.init_weights)
 
 
 class RobertaForMaskedLM(BertPreTrainedModel):
@@ -94,16 +95,31 @@ class RobertaForMaskedLM(BertPreTrainedModel):
 
         self.roberta = RobertaModel(config)
         self.lm_head = RobertaLMHead(config)
-    
-    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
+
+        self.apply(self.init_weights)
+        self.tie_weights()
+
+    def tie_weights(self):
+        """ Make sure we are sharing the input and output embeddings.
+            Export to TorchScript can't handle parameter sharing so we are cloning them instead.
+        """
+        self._tie_or_clone_weights(self.lm_head.decoder, self.roberta.embeddings.word_embeddings)
+
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, masked_lm_labels=None, position_ids=None,
+                head_mask=None):
         outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
                             attention_mask=attention_mask, head_mask=head_mask)
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)
 
         outputs = (prediction_scores,) + outputs[2:]
-        return outputs
 
+        if masked_lm_labels is not None:
+            loss_fct = CrossEntropyLoss(ignore_index=-1)
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+            outputs = (masked_lm_loss,) + outputs
+
+        return outputs
 
 
 class RobertaLMHead(nn.Module):
@@ -114,7 +130,7 @@ class RobertaLMHead(nn.Module):
         self.dense = nn.Linear(config.hidden_size, config.hidden_size)
         self.layer_norm = BertLayerNorm(config.hidden_size, eps=config.layer_norm_eps)
 
-        self.weight = nn.Linear(config.hidden_size, config.vocab_size, bias=False).weight
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
         self.bias = nn.Parameter(torch.zeros(config.vocab_size))
 
     def forward(self, features, **kwargs):
@@ -123,6 +139,6 @@ class RobertaLMHead(nn.Module):
         x = self.layer_norm(x)
 
         # project back to size of vocabulary with bias
-        x = F.linear(x, self.weight) + self.bias
+        x = self.decoder(x) + self.bias
 
         return x
diff --git a/pytorch_transformers/tests/modeling_roberta_test.py b/pytorch_transformers/tests/modeling_roberta_test.py
index 62707326a6..273176b27a 100644
--- a/pytorch_transformers/tests/modeling_roberta_test.py
+++ b/pytorch_transformers/tests/modeling_roberta_test.py
@@ -12,58 +12,172 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
 
-import os
 import unittest
+import shutil
 import pytest
-import torch
 
-from pytorch_transformers.modeling_roberta import (RobertaForMaskedLM,
-                                                   RobertaModel)
+from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM)
+from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+
+from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
 
 
-class RobertaModelTest(unittest.TestCase):
+class RobertaModelTest(CommonTestCases.CommonModelTester):
 
-    # @pytest.mark.slow
-    def test_inference_masked_lm(self):
-        model = RobertaForMaskedLM.from_pretrained('roberta-base')
-        
-        input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
-        output = model(input_ids)[0]
-        expected_shape = torch.Size((1, 11, 50265))
-        self.assertEqual(
-            output.shape,
-            expected_shape
-        )
-        # compare the actual values for a slice.
-        expected_slice = torch.Tensor(
-            [[[33.8843, -4.3107, 22.7779],
-              [ 4.6533, -2.8099, 13.6252],
-              [ 1.8222, -3.6898,  8.8600]]]
-        )
-        self.assertTrue(
-            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
-        )
+    all_model_classes = (RobertaForMaskedLM, RobertaModel)
 
-    # @pytest.mark.slow
-    def test_inference_no_head(self):
-        model = RobertaModel.from_pretrained('roberta-base')
-        
-        input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
-        output = model(input_ids)[0]
-        # compare the actual values for a slice.
-        expected_slice = torch.Tensor(
-            [[[-0.0231,  0.0782,  0.0074],
-              [-0.1854,  0.0539, -0.0174],
-              [ 0.0548,  0.0799,  0.1687]]]
-        )
-        self.assertTrue(
-            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
-        )
+    class RobertaModelTester(object):
 
+        def __init__(self,
+                     parent,
+                     batch_size=13,
+                     seq_length=7,
+                     is_training=True,
+                     use_input_mask=True,
+                     use_token_type_ids=True,
+                     use_labels=True,
+                     vocab_size=99,
+                     hidden_size=32,
+                     num_hidden_layers=5,
+                     num_attention_heads=4,
+                     intermediate_size=37,
+                     hidden_act="gelu",
+                     hidden_dropout_prob=0.1,
+                     attention_probs_dropout_prob=0.1,
+                     max_position_embeddings=512,
+                     type_vocab_size=16,
+                     type_sequence_label_size=2,
+                     initializer_range=0.02,
+                     num_labels=3,
+                     num_choices=4,
+                     scope=None,
+                    ):
+            self.parent = parent
+            self.batch_size = batch_size
+            self.seq_length = seq_length
+            self.is_training = is_training
+            self.use_input_mask = use_input_mask
+            self.use_token_type_ids = use_token_type_ids
+            self.use_labels = use_labels
+            self.vocab_size = vocab_size
+            self.hidden_size = hidden_size
+            self.num_hidden_layers = num_hidden_layers
+            self.num_attention_heads = num_attention_heads
+            self.intermediate_size = intermediate_size
+            self.hidden_act = hidden_act
+            self.hidden_dropout_prob = hidden_dropout_prob
+            self.attention_probs_dropout_prob = attention_probs_dropout_prob
+            self.max_position_embeddings = max_position_embeddings
+            self.type_vocab_size = type_vocab_size
+            self.type_sequence_label_size = type_sequence_label_size
+            self.initializer_range = initializer_range
+            self.num_labels = num_labels
+            self.num_choices = num_choices
+            self.scope = scope
 
+        def prepare_config_and_inputs(self):
+            input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
 
-if __name__ == '__main__':
+            input_mask = None
+            if self.use_input_mask:
+                input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+            token_type_ids = None
+            if self.use_token_type_ids:
+                token_type_ids = ids_tensor([self.batch_size, self.seq_length], self.type_vocab_size)
+
+            sequence_labels = None
+            token_labels = None
+            choice_labels = None
+            if self.use_labels:
+                sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+                token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+                choice_labels = ids_tensor([self.batch_size], self.num_choices)
+
+            config = RobertaConfig(
+                vocab_size_or_config_json_file=self.vocab_size,
+                hidden_size=self.hidden_size,
+                num_hidden_layers=self.num_hidden_layers,
+                num_attention_heads=self.num_attention_heads,
+                intermediate_size=self.intermediate_size,
+                hidden_act=self.hidden_act,
+                hidden_dropout_prob=self.hidden_dropout_prob,
+                attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+                max_position_embeddings=self.max_position_embeddings,
+                type_vocab_size=self.type_vocab_size,
+                initializer_range=self.initializer_range)
+
+            return config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
+
+        def check_loss_output(self, result):
+            self.parent.assertListEqual(
+                list(result["loss"].size()),
+                [])
+
+        def create_and_check_roberta_model(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
+                                           token_labels, choice_labels):
+            model = RobertaModel(config=config)
+            model.eval()
+            sequence_output, pooled_output = model(input_ids, token_type_ids, input_mask)
+            sequence_output, pooled_output = model(input_ids, token_type_ids)
+            sequence_output, pooled_output = model(input_ids)
+
+            result = {
+                "sequence_output": sequence_output,
+                "pooled_output": pooled_output,
+            }
+            self.parent.assertListEqual(
+                list(result["sequence_output"].size()),
+                [self.batch_size, self.seq_length, self.hidden_size])
+            self.parent.assertListEqual(list(result["pooled_output"].size()), [self.batch_size, self.hidden_size])
+
+        def create_and_check_roberta_for_masked_lm(self, config, input_ids, token_type_ids, input_mask, sequence_labels,
+                                                   token_labels, choice_labels):
+            model = RobertaForMaskedLM(config=config)
+            model.eval()
+            loss, prediction_scores = model(input_ids, token_type_ids, input_mask, token_labels)
+            result = {
+                "loss": loss,
+                "prediction_scores": prediction_scores,
+            }
+            self.parent.assertListEqual(
+                list(result["prediction_scores"].size()),
+                [self.batch_size, self.seq_length, self.vocab_size])
+            self.check_loss_output(result)
+
+        def prepare_config_and_inputs_for_common(self):
+            config_and_inputs = self.prepare_config_and_inputs()
+            (config, input_ids, token_type_ids, input_mask,
+             sequence_labels, token_labels, choice_labels) = config_and_inputs
+            inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
+            return config, inputs_dict
+
+    def setUp(self):
+        self.model_tester = RobertaModelTest.RobertaModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_roberta_model(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_roberta_model(*config_and_inputs)
+
+    def test_for_masked_lm(self):
+        config_and_inputs = self.model_tester.prepare_config_and_inputs()
+        self.model_tester.create_and_check_roberta_for_masked_lm(*config_and_inputs)
+
+    @pytest.mark.slow
+    def test_model_from_pretrained(self):
+        cache_dir = "/tmp/pytorch_transformers_test/"
+        for model_name in list(ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP.keys())[:1]:
+            model = RobertaModel.from_pretrained(model_name, cache_dir=cache_dir)
+            shutil.rmtree(cache_dir)
+            self.assertIsNotNone(model)
+
+if __name__ == "__main__":
     unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py
index cd4e17ec34..60df18ae2b 100644
--- a/pytorch_transformers/tests/tokenization_roberta_test.py
+++ b/pytorch_transformers/tests/tokenization_roberta_test.py
@@ -12,32 +12,45 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from __future__ import (absolute_import, division, print_function,
-                        unicode_literals)
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import os
 import unittest
-import pytest
-import six
 
-from pytorch_transformers.tokenization_roberta import RobertaTokenizer
+from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
+from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
 
 
 class RobertaTokenizationTest(unittest.TestCase):
 
-    # @pytest.mark.slow
     def test_full_tokenizer(self):
-        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
-        self.assertListEqual(
-            tokenizer.encode('Hello world!'),
-            [0, 31414, 232, 328, 2]
-        )
-        if six.PY3:
-            self.assertListEqual(
-                tokenizer.encode('Hello world! cécé herlolip'),
-                [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
-            )
+        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+        vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
+                 "lo", "low", "er",
+                 "low", "lowest", "newer", "wider", "<unk>"]
+        vocab_tokens = dict(zip(vocab, range(len(vocab))))
+        special_tokens_map = {"unk_token": "<unk>"}
 
+        with TemporaryDirectory() as tmpdirname:
+            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+            with open(vocab_file, "w") as fp:
+                [fp.write(f"{vocab} {index}\n") for index, vocab in enumerate(vocab_tokens)]
+
+            input_text = u"lower newer"
+            output_text = u"lower<unk>newer"
+
+            create_and_check_tokenizer_commons(self, input_text, output_text, RobertaTokenizer, tmpdirname, **special_tokens_map)
+
+            tokenizer = RobertaTokenizer(vocab_file, **special_tokens_map)
+            text = "lower"
+            bpe_tokens = ["low", "er"]
+            tokens = tokenizer.tokenize(text)
+            self.assertListEqual(tokens, bpe_tokens)
+
+            input_tokens = tokens + [tokenizer.unk_token]
+            input_bpe_tokens = [13, 12, 17]
+            self.assertListEqual(
+                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
 
 if __name__ == '__main__':
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
index 4f9a7bc0fa..7fa42bfb1c 100644
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -22,22 +22,22 @@ import re
 from io import open
 import six
 
-from .tokenization_utils import PreTrainedTokenizer
+from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
 from .tokenization_gpt2 import GPT2Tokenizer
 
 logger = logging.getLogger(__name__)
 
 VOCAB_FILES_NAMES = {
-    'dict_file': 'dict.txt',
+    'vocab_file': 'dict.txt',
 }
 
 PRETRAINED_VOCAB_FILES_MAP = {
-    'dict_file':
-    {
-        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
-        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
-        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
-    },
+    'vocab_file':
+        {
+            'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
+            'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
+            'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
+        },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
@@ -46,7 +46,6 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
     'roberta-large-mnli': 512,
 }
 
-
 SPACE_NORMALIZER = re.compile(r"\s+")
 
 def tokenize_line(line):
@@ -142,7 +141,7 @@ class Dictionary(object):
                                 "rebuild the dataset".format(f))
             return
 
-        lines = f.readlines()
+        lines = f.read().splitlines()
         for line in lines:
             idx = line.rfind(' ')
             if idx == -1:
@@ -152,7 +151,7 @@ class Dictionary(object):
             self.indices[word] = len(self.symbols)
             self.symbols.append(word)
             self.count.append(count)
-    
+
     def encode_line(self, line, line_tokenizer=tokenize_line, add_if_not_exist=True,
                     consumer=None, append_eos=True, reverse_order=False):
         words = line_tokenizer(line)
@@ -174,8 +173,6 @@ class Dictionary(object):
         return ids
 
 
-
-
 class RobertaTokenizer(PreTrainedTokenizer):
     """
     RoBERTa tokenizer. Peculiarities:
@@ -185,25 +182,53 @@ class RobertaTokenizer(PreTrainedTokenizer):
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, dict_file,
+    def __init__(self, vocab_file,
                  bos_token="<s>", eos_token="</s>", **kwargs):
-        super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, **kwargs)
+        super(RobertaTokenizer, self).__init__(cls_token=bos_token, sep_token=eos_token, eos_token=eos_token, **kwargs)
 
         self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        self.dictionary = Dictionary.load(dict_file)
+        self.dictionary = Dictionary.load(vocab_file)
 
     def _tokenize(self, text):
         """ Use GPT-2 Tokenizer """
         return self.gpt2_tokenizer._tokenize(text)
 
-    def encode(self, text):
+    def encode(self, text, *args):
         """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
         """
-        gpt2_tokens_joined = " ".join(
-            str(x) for x in self.gpt2_tokenizer.convert_tokens_to_ids(self.tokenize(text))
-        )
-        bpe_sentence = '<s> ' + gpt2_tokens_joined + ' </s>'
-        return self.dictionary.encode_line(bpe_sentence, append_eos=False)
+        bpe_sentence = [self.cls_token] + \
+                       self.gpt2_tokenizer.convert_tokens_to_ids(self.tokenize(text)) + \
+                       [self.sep_token]
+
+        if len(args):
+            for additional_sentence in args:
+                bpe_sentence += [self.sep_token
+                                 ] + \
+                                self.gpt2_tokenizer.convert_tokens_to_ids(self.tokenize(additional_sentence)) + \
+                                [self.sep_token]
+
+        return self.dictionary.encode_line(' '.join([str(token) for token in bpe_sentence]), append_eos=False)
+
+    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+        """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
+            with options to remove special tokens and clean up tokenization spaces.
+            Handles sentence pairs.
+        """
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+
+        if any(isinstance(element, list) for element in filtered_tokens):
+            texts = []
+            for element in filtered_tokens:
+                text = self.convert_tokens_to_string(element)
+                if clean_up_tokenization_spaces:
+                    text = clean_up_tokenization(text)
+                    texts.append(text)
+            return texts
+        else:
+            text = self.convert_tokens_to_string(filtered_tokens)
+            if clean_up_tokenization_spaces:
+                text = clean_up_tokenization(text)
+            return text
 
     def _convert_token_to_id(self, token):
         return self.dictionary.index(token)
@@ -218,3 +243,24 @@ class RobertaTokenizer(PreTrainedTokenizer):
 
     def convert_tokens_to_string(self, tokens):
         return self.gpt2_tokenizer.convert_tokens_to_string(tokens)
+
+    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
+        # Remove the first and last tokens which are cls and sep tokens
+        ids = ids[1:-1]
+        # If multi sentence, then split (multi sentence found by looking for two sequential sep tokens)
+        ids = [list(map(int, example.split(' '))) for example in ' '.join([str(id) for id in ids]).split(' 2 2 ')]
+
+        if len(ids) == 1:
+            tokens = self.gpt2_tokenizer.convert_ids_to_tokens(list(map(lambda id: int(self.dictionary[id]), ids[0])))
+        else:
+            tokens = []
+            for example in ids:
+                tokens += [
+                    self.gpt2_tokenizer.convert_ids_to_tokens(list(map(lambda id: int(self.dictionary[id]), example)))]
+        return tokens
+
+    def convert_tokens_to_ids(self, tokens):
+        tokens = " ".join(str(x) for x in self.gpt2_tokenizer.convert_tokens_to_ids(tokens))
+        bpe_sentence = '<s> ' + tokens + ' </s>'
+        return self.dictionary.encode_line(bpe_sentence, append_eos=False)
+

From 39d72bcc7b2c99c04b6f483f0d8e7bdff547d37c Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 7 Aug 2019 14:21:57 -0400
Subject: [PATCH 05/45] Fixed the RoBERTa checkpoint conversion script
 according to the LM head refactoring.

---
 pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
index 7a17ee3f1b..f21afa29ed 100644
--- a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
@@ -123,7 +123,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
     model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
     model.lm_head.layer_norm.variance_epsilon = roberta.model.decoder.lm_head.layer_norm.eps
-    model.lm_head.weight = roberta.model.decoder.lm_head.weight
+    model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
     model.lm_head.bias = roberta.model.decoder.lm_head.bias
 
     # Let's check that we get the same results.

From 9d0603148bc34255fad0cad73ce438ecd7306322 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Thu, 8 Aug 2019 11:24:54 -0400
Subject: [PATCH 06/45] [RoBERTa] RobertaForSequenceClassification + conversion

---
 .../convert_roberta_checkpoint_to_pytorch.py  | 36 ++++++++----
 pytorch_transformers/modeling_roberta.py      | 57 ++++++++++++++++++
 .../tests/modeling_roberta_test.py            | 58 +++++++++++++++++++
 3 files changed, 140 insertions(+), 11 deletions(-)

diff --git a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
index f21afa29ed..85ad5ad15b 100644
--- a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
@@ -30,6 +30,7 @@ from pytorch_transformers.modeling_bert import (BertConfig, BertEncoder,
                                                 BertSelfOutput)
 from pytorch_transformers.modeling_roberta import (RobertaEmbeddings,
                                                    RobertaForMaskedLM,
+                                                   RobertaForSequenceClassification,
                                                    RobertaModel)
 
 logging.basicConfig(level=logging.INFO)
@@ -38,7 +39,7 @@ logger = logging.getLogger(__name__)
 SAMPLE_TEXT = 'Hello world! cécé herlolip'
 
 
-def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path):
+def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_folder_path, classification_head):
     """
     Copy/paste/tweak roberta's weights to our BERT structure.
     """
@@ -53,9 +54,11 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         max_position_embeddings=514,
         type_vocab_size=1,
     )
+    if classification_head:
+        config.num_labels = roberta.args.num_classes
     print("Our BERT config:", config)
 
-    model = RobertaForMaskedLM(config)
+    model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
     model.eval()
 
     # Now let's copy all the weights.
@@ -117,14 +120,20 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         bert_output.LayerNorm.variance_epsilon = roberta_layer.final_layer_norm.eps
         #### end of layer
     
-    # LM Head
-    model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
-    model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
-    model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
-    model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
-    model.lm_head.layer_norm.variance_epsilon = roberta.model.decoder.lm_head.layer_norm.eps
-    model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
-    model.lm_head.bias = roberta.model.decoder.lm_head.bias
+    if classification_head:
+        model.classifier.dense.weight = roberta.model.classification_heads['mnli'].dense.weight
+        model.classifier.dense.bias = roberta.model.classification_heads['mnli'].dense.bias
+        model.classifier.out_proj.weight = roberta.model.classification_heads['mnli'].out_proj.weight
+        model.classifier.out_proj.bias = roberta.model.classification_heads['mnli'].out_proj.bias
+    else:
+        # LM Head
+        model.lm_head.dense.weight = roberta.model.decoder.lm_head.dense.weight
+        model.lm_head.dense.bias = roberta.model.decoder.lm_head.dense.bias
+        model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
+        model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
+        model.lm_head.layer_norm.variance_epsilon = roberta.model.decoder.lm_head.layer_norm.eps
+        model.lm_head.weight = roberta.model.decoder.lm_head.weight
+        model.lm_head.bias = roberta.model.decoder.lm_head.bias
 
     # Let's check that we get the same results.
     input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1
@@ -157,8 +166,13 @@ if __name__ == "__main__":
                         type = str,
                         required = True,
                         help = "Path to the output PyTorch model.")
+    parser.add_argument("--classification_head",
+                        action = "store_true",
+                        help = "Whether to convert a final classification head.")
     args = parser.parse_args()
     convert_roberta_checkpoint_to_pytorch(
         args.roberta_checkpoint_path,
-        args.pytorch_dump_folder_path
+        args.pytorch_dump_folder_path,
+        args.classification_head
     )
+
diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py
index 43f76989f4..43c9362b30 100644
--- a/pytorch_transformers/modeling_roberta.py
+++ b/pytorch_transformers/modeling_roberta.py
@@ -142,3 +142,60 @@ class RobertaLMHead(nn.Module):
         x = self.decoder(x) + self.bias
 
         return x
+
+
+
+class RobertaForSequenceClassification(BertPreTrainedModel):
+    """
+    Roberta Model with a classifier head on top.
+    """
+    config_class = RobertaConfig
+    pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
+    base_model_prefix = "roberta"
+
+    def __init__(self, config):
+        super(RobertaForSequenceClassification, self).__init__(config)
+        self.num_labels = config.num_labels
+
+        self.roberta = RobertaModel(config)
+        self.classifier = RobertaClassificationHead(config)
+    
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None,
+                position_ids=None, head_mask=None):
+        outputs = self.roberta(input_ids, position_ids=position_ids, token_type_ids=token_type_ids,
+                            attention_mask=attention_mask, head_mask=head_mask)
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+
+        outputs = (logits,) + outputs[2:]
+        if labels is not None:
+            if self.num_labels == 1:
+                #  We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            outputs = (loss,) + outputs
+
+        return outputs  # (loss), logits, (hidden_states), (attentions)
+
+
+
+class RobertaClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config):
+        super(RobertaClassificationHead, self).__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
diff --git a/pytorch_transformers/tests/modeling_roberta_test.py b/pytorch_transformers/tests/modeling_roberta_test.py
index 273176b27a..36145466b9 100644
--- a/pytorch_transformers/tests/modeling_roberta_test.py
+++ b/pytorch_transformers/tests/modeling_roberta_test.py
@@ -179,5 +179,63 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
             shutil.rmtree(cache_dir)
             self.assertIsNotNone(model)
 
+
+
+class RobertaModelIntegrationTest(unittest.TestCase):
+
+    @pytest.mark.slow
+    def test_inference_masked_lm(self):
+        model = RobertaForMaskedLM.from_pretrained('roberta-base')
+        
+        input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
+        output = model(input_ids)[0]
+        expected_shape = torch.Size((1, 11, 50265))
+        self.assertEqual(
+            output.shape,
+            expected_shape
+        )
+        # compare the actual values for a slice.
+        expected_slice = torch.Tensor(
+            [[[33.8843, -4.3107, 22.7779],
+              [ 4.6533, -2.8099, 13.6252],
+              [ 1.8222, -3.6898,  8.8600]]]
+        )
+        self.assertTrue(
+            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
+        )
+
+    @pytest.mark.slow
+    def test_inference_no_head(self):
+        model = RobertaModel.from_pretrained('roberta-base')
+        
+        input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
+        output = model(input_ids)[0]
+        # compare the actual values for a slice.
+        expected_slice = torch.Tensor(
+            [[[-0.0231,  0.0782,  0.0074],
+              [-0.1854,  0.0539, -0.0174],
+              [ 0.0548,  0.0799,  0.1687]]]
+        )
+        self.assertTrue(
+            torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
+        )
+
+    @pytest.mark.slow
+    def test_inference_classification_head(self):
+        model = RobertaForSequenceClassification.from_pretrained('roberta-large-mnli')
+        
+        input_ids = torch.tensor([[    0, 31414,   232,   328,   740,  1140, 12695,    69, 46078,  1588,   2]])
+        output = model(input_ids)[0]
+        expected_shape = torch.Size((1, 3))
+        self.assertEqual(
+            output.shape,
+            expected_shape
+        )
+        expected_tensor = torch.Tensor([[-0.9469,  0.3913,  0.5118]])
+        self.assertTrue(
+            torch.allclose(output, expected_tensor, atol=1e-3)
+        )
+
+
 if __name__ == "__main__":
     unittest.main()

From e367ac469c27949854a08c5c5ba5b392c3fbcb0a Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Thu, 8 Aug 2019 11:26:11 -0400
Subject: [PATCH 07/45] [RoBERTa] Re-apply
 39d72bcc7b2c99c04b6f483f0d8e7bdff547d37c

cc @lysandrejik
---
 pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
index 85ad5ad15b..e4e8fbb25d 100644
--- a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
@@ -132,7 +132,7 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
         model.lm_head.layer_norm.weight = roberta.model.decoder.lm_head.layer_norm.weight
         model.lm_head.layer_norm.bias = roberta.model.decoder.lm_head.layer_norm.bias
         model.lm_head.layer_norm.variance_epsilon = roberta.model.decoder.lm_head.layer_norm.eps
-        model.lm_head.weight = roberta.model.decoder.lm_head.weight
+        model.lm_head.decoder.weight = roberta.model.decoder.lm_head.weight
         model.lm_head.bias = roberta.model.decoder.lm_head.bias
 
     # Let's check that we get the same results.

From 6c41a8f5dc5c630f31fda7b1617b701b40ea27d6 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 8 Aug 2019 18:20:32 -0400
Subject: [PATCH 08/45] Encode and Decode are back in the superclass. They now
 handle special tokens for sentence pairs.

---
 pytorch_transformers/__init__.py             |   3 +-
 pytorch_transformers/modeling_roberta.py     |   3 +-
 pytorch_transformers/tokenization_roberta.py | 108 +++++++------------
 pytorch_transformers/tokenization_utils.py   |  46 ++++++--
 4 files changed, 81 insertions(+), 79 deletions(-)

diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index c4148e283c..38423de14b 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -7,7 +7,6 @@ from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_xlnet import XLNetTokenizer, SPIECE_UNDERLINE
 from .tokenization_xlm import XLMTokenizer
 from .tokenization_roberta import RobertaTokenizer
-from .tokenization_utils import (PreTrainedTokenizer, clean_up_tokenization)
 
 from .tokenization_utils import (PreTrainedTokenizer)
 
@@ -39,7 +38,7 @@ from .modeling_xlm import (XLMConfig, XLMPreTrainedModel , XLMModel,
                            XLMWithLMHeadModel, XLMForSequenceClassification,
                            XLMForQuestionAnswering, XLM_PRETRAINED_CONFIG_ARCHIVE_MAP,
                            XLM_PRETRAINED_MODEL_ARCHIVE_MAP)
-from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel,
+from .modeling_roberta import (RobertaConfig, RobertaForMaskedLM, RobertaModel, RobertaForSequenceClassification,
                                ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP)
 from .modeling_utils import (WEIGHTS_NAME, CONFIG_NAME, TF_WEIGHTS_NAME,
                           PretrainedConfig, PreTrainedModel, prune_layer, Conv1D)
diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py
index 43c9362b30..6cd4bc2d35 100644
--- a/pytorch_transformers/modeling_roberta.py
+++ b/pytorch_transformers/modeling_roberta.py
@@ -23,7 +23,7 @@ import logging
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from torch.nn import CrossEntropyLoss
+from torch.nn import CrossEntropyLoss, MSELoss
 
 from pytorch_transformers.modeling_bert import (BertConfig, BertEmbeddings,
                                                 BertLayerNorm, BertModel,
@@ -144,7 +144,6 @@ class RobertaLMHead(nn.Module):
         return x
 
 
-
 class RobertaForSequenceClassification(BertPreTrainedModel):
     """
     Roberta Model with a classifier head on top.
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
index 7fa42bfb1c..4ec53a65b0 100644
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -21,18 +21,19 @@ import logging
 import re
 from io import open
 import six
+import os
 
-from .tokenization_utils import PreTrainedTokenizer, clean_up_tokenization
+from .tokenization_utils import PreTrainedTokenizer
 from .tokenization_gpt2 import GPT2Tokenizer
 
 logger = logging.getLogger(__name__)
 
-VOCAB_FILES_NAMES = {
-    'vocab_file': 'dict.txt',
+DICT_FILES_NAMES = {
+    'dict_file': 'dict.txt',
 }
 
-PRETRAINED_VOCAB_FILES_MAP = {
-    'vocab_file':
+PRETRAINED_DICT_FILES_MAP = {
+    'dict_file':
         {
             'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
             'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
@@ -178,89 +179,62 @@ class RobertaTokenizer(PreTrainedTokenizer):
     RoBERTa tokenizer. Peculiarities:
         - GPT-2 tokenizer with a different integer mapping on top.
     """
-    vocab_files_names = VOCAB_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    vocab_files_names = DICT_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_DICT_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, vocab_file,
-                 bos_token="<s>", eos_token="</s>", **kwargs):
-        super(RobertaTokenizer, self).__init__(cls_token=bos_token, sep_token=eos_token, eos_token=eos_token, **kwargs)
+    def __init__(self, dict_file, bpe_tokenizer=None, bos_token="<s>", eos_token="</s>", sep_token="</s>", cls_token="<s>",
+                 unk_token="<unk>", **kwargs):
+        super(RobertaTokenizer, self).__init__(cls_token=bos_token, sep_token=eos_token, eos_token=eos_token,
+                                               unk_token=unk_token, **kwargs)
 
-        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
-        self.dictionary = Dictionary.load(vocab_file)
+        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2") if bpe_tokenizer is None else bpe_tokenizer
+        self.dictionary = Dictionary.load(dict_file)
+
+    @property
+    def vocab_size(self):
+        return len(self.dictionary.indices)
 
     def _tokenize(self, text):
         """ Use GPT-2 Tokenizer """
         return self.gpt2_tokenizer._tokenize(text)
 
-    def encode(self, text, *args):
-        """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
-        """
-        bpe_sentence = [self.cls_token] + \
-                       self.gpt2_tokenizer.convert_tokens_to_ids(self.tokenize(text)) + \
-                       [self.sep_token]
-
-        if len(args):
-            for additional_sentence in args:
-                bpe_sentence += [self.sep_token
-                                 ] + \
-                                self.gpt2_tokenizer.convert_tokens_to_ids(self.tokenize(additional_sentence)) + \
-                                [self.sep_token]
-
-        return self.dictionary.encode_line(' '.join([str(token) for token in bpe_sentence]), append_eos=False)
-
-    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
-        """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
-            with options to remove special tokens and clean up tokenization spaces.
-            Handles sentence pairs.
-        """
-        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
-
-        if any(isinstance(element, list) for element in filtered_tokens):
-            texts = []
-            for element in filtered_tokens:
-                text = self.convert_tokens_to_string(element)
-                if clean_up_tokenization_spaces:
-                    text = clean_up_tokenization(text)
-                    texts.append(text)
-            return texts
-        else:
-            text = self.convert_tokens_to_string(filtered_tokens)
-            if clean_up_tokenization_spaces:
-                text = clean_up_tokenization(text)
-            return text
-
     def _convert_token_to_id(self, token):
-        return self.dictionary.index(token)
+        if self.dictionary.index(token) != 3:
+            return self.dictionary.index(token)
+        return self.dictionary.index(str(self.gpt2_tokenizer.convert_tokens_to_ids(token)))
 
     def _convert_id_to_token(self, index):
         symbol = self.dictionary[index]
         try:
             idx = int(symbol)
             return self.gpt2_tokenizer._convert_id_to_token(idx)
-        except:
+        except ValueError:
             return symbol
 
     def convert_tokens_to_string(self, tokens):
         return self.gpt2_tokenizer.convert_tokens_to_string(tokens)
 
+    def convert_tokens_to_ids(self, tokens, no_sep_cls_tokens=False):
+        cls = [self._convert_token_to_id(self.cls_token)]
+        tokens = super().convert_tokens_to_ids(tokens)
+        sep = [self._convert_token_to_id(self.sep_token)]
+        return (cls + tokens + sep) if (isinstance(tokens, list) and not no_sep_cls_tokens) else tokens
+
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        # Remove the first and last tokens which are cls and sep tokens
-        ids = ids[1:-1]
-        # If multi sentence, then split (multi sentence found by looking for two sequential sep tokens)
-        ids = [list(map(int, example.split(' '))) for example in ' '.join([str(id) for id in ids]).split(' 2 2 ')]
+        return super().convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)[1:-1]
 
-        if len(ids) == 1:
-            tokens = self.gpt2_tokenizer.convert_ids_to_tokens(list(map(lambda id: int(self.dictionary[id]), ids[0])))
-        else:
-            tokens = []
-            for example in ids:
-                tokens += [
-                    self.gpt2_tokenizer.convert_ids_to_tokens(list(map(lambda id: int(self.dictionary[id]), example)))]
-        return tokens
+    def save_vocabulary(self, save_directory):
+        """Save the tokenizer vocabulary and merge files to a directory."""
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        dict_file = os.path.join(save_directory, DICT_FILES_NAMES['dict_file'])
 
-    def convert_tokens_to_ids(self, tokens):
-        tokens = " ".join(str(x) for x in self.gpt2_tokenizer.convert_tokens_to_ids(tokens))
-        bpe_sentence = '<s> ' + tokens + ' </s>'
-        return self.dictionary.encode_line(bpe_sentence, append_eos=False)
+        with open(dict_file, 'w', encoding='utf-8') as f:
+            for i in range(self.dictionary.nspecial, len(self.dictionary.count)):
+                f.write(f"{list(self.dictionary.indices.keys())[i]} {self.dictionary.count[i]}\n")
 
+        vocab_files = self.gpt2_tokenizer.save_pretrained(save_directory)
+
+        return vocab_files + (dict_file,)
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 2e75c83bfb..232ef1c35b 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -495,7 +495,7 @@ class PreTrainedTokenizer(object):
         """
         raise NotImplementedError
 
-    def convert_tokens_to_ids(self, tokens):
+    def convert_tokens_to_ids(self, tokens, **kwargs):
         """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
             (resp. a sequence of ids), using the vocabulary.
         """
@@ -520,12 +520,29 @@ class PreTrainedTokenizer(object):
         raise NotImplementedError
 
 
-    def encode(self, text):
+    def encode(self, *text, cls_token_at_end=False, double_sep_token=False, no_sep_cls_tokens=False):
         """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
         
         Same doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
         """
-        return self.convert_tokens_to_ids(self.tokenize(text))
+
+        if len(text) == 1:
+            return self.convert_tokens_to_ids(self.tokenize(text[0]), no_sep_cls_tokens=no_sep_cls_tokens)
+
+        if len(text) > 2:
+            logger.warning("Tokenization currently only supports sentence pairs. Ignoring every string following the "
+                           "initial two.")
+
+        first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text[0])]
+        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text[1])]
+        sep = [self._convert_token_to_id(self.sep_token)]
+        cls = [self._convert_token_to_id(self.cls_token)]
+        n_sep_token = 2 if double_sep_token else 1
+
+        tokens = first_sentence_tokens + sep * n_sep_token + second_sentence_tokens + sep
+        tokens = (tokens + cls) if cls_token_at_end else (cls + tokens)
+
+        return tokens
 
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
@@ -560,7 +577,8 @@ class PreTrainedTokenizer(object):
         """
         return ' '.join(self.convert_ids_to_tokens(tokens))
 
-    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
+    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True, cls_token_at_end=False,
+               double_sep_token=False):
         """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
             with options to remove special tokens and clean up tokenization spaces.
 
@@ -568,9 +586,21 @@ class PreTrainedTokenizer(object):
         """
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
         text = self.convert_tokens_to_string(filtered_tokens)
-        if clean_up_tokenization_spaces:
-            text = self.clean_up_tokenization(text)
-        return text
+
+        if self.sep_token is not None and self.sep_token in text:
+            text = text.replace(self.cls_token, self.sep_token)
+            split_text = list(filter(lambda sentence: len(sentence) > 0, text.split(self.sep_token)))
+            if clean_up_tokenization_spaces:
+                clean_text = [self.clean_up_tokenization(text) for text in split_text]
+                return clean_text
+            else:
+                return split_text
+        else:
+            if clean_up_tokenization_spaces:
+                clean_text = self.clean_up_tokenization(text)
+                return clean_text
+            else:
+                return text
 
     @property
     def special_tokens_map(self):
@@ -602,7 +632,7 @@ class PreTrainedTokenizer(object):
             class attributes (cls_token, unk_token...).
         """
         all_toks = self.all_special_tokens
-        all_ids = list(self.convert_tokens_to_ids(t) for t in all_toks)
+        all_ids = list(self._convert_token_to_id(t) for t in all_toks)
         return all_ids
 
     @staticmethod

From fbd746bd065a9aaacd1ef25840cdc9ec957e8cac Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 8 Aug 2019 18:21:34 -0400
Subject: [PATCH 09/45] Updated test architecture

---
 .../tests/modeling_roberta_test.py            | 43 +++++++++++-
 .../tests/tokenization_roberta_test.py        | 70 +++++++++++++------
 .../tests/tokenization_tests_commons.py       |  5 +-
 3 files changed, 91 insertions(+), 27 deletions(-)

diff --git a/pytorch_transformers/tests/modeling_roberta_test.py b/pytorch_transformers/tests/modeling_roberta_test.py
index 36145466b9..e0455d8508 100644
--- a/pytorch_transformers/tests/modeling_roberta_test.py
+++ b/pytorch_transformers/tests/modeling_roberta_test.py
@@ -19,8 +19,9 @@ from __future__ import print_function
 import unittest
 import shutil
 import pytest
+import torch
 
-from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM)
+from pytorch_transformers import (RobertaConfig, RobertaModel, RobertaForMaskedLM, RobertaForSequenceClassification)
 from pytorch_transformers.modeling_roberta import ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
 
 from .modeling_common_test import (CommonTestCases, ConfigTester, ids_tensor)
@@ -156,6 +157,42 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
             inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
             return config, inputs_dict
 
+        def test_inference_masked_lm(self):
+            model = RobertaForMaskedLM.from_pretrained('roberta-base')
+
+            input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+            output = model(input_ids)[0]
+            expected_shape = torch.Size((1, 11, 50265))
+            self.assertEqual(
+                output.shape,
+                expected_shape
+            )
+            # compare the actual values for a slice.
+            expected_slice = torch.Tensor(
+                [[[33.8843, -4.3107, 22.7779],
+                  [4.6533, -2.8099, 13.6252],
+                  [1.8222, -3.6898, 8.8600]]]
+            )
+            self.assertTrue(
+                torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
+            )
+
+        # @pytest.mark.slow
+        def test_inference_no_head(self):
+            model = RobertaModel.from_pretrained('roberta-base')
+
+            input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
+            output = model(input_ids)[0]
+            # compare the actual values for a slice.
+            expected_slice = torch.Tensor(
+                [[[-0.0231, 0.0782, 0.0074],
+                  [-0.1854, 0.0539, -0.0174],
+                  [0.0548, 0.0799, 0.1687]]]
+            )
+            self.assertTrue(
+                torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
+            )
+
     def setUp(self):
         self.model_tester = RobertaModelTest.RobertaModelTester(self)
         self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
@@ -183,7 +220,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
 
 class RobertaModelIntegrationTest(unittest.TestCase):
 
-    @pytest.mark.slow
+    # @pytest.mark.slow
     def test_inference_masked_lm(self):
         model = RobertaForMaskedLM.from_pretrained('roberta-base')
         
@@ -204,7 +241,7 @@ class RobertaModelIntegrationTest(unittest.TestCase):
             torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
         )
 
-    @pytest.mark.slow
+    # @pytest.mark.slow
     def test_inference_no_head(self):
         model = RobertaModel.from_pretrained('roberta-base')
         
diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py
index 60df18ae2b..fbb3f8381d 100644
--- a/pytorch_transformers/tests/tokenization_roberta_test.py
+++ b/pytorch_transformers/tests/tokenization_roberta_test.py
@@ -15,42 +15,68 @@
 from __future__ import absolute_import, division, print_function, unicode_literals
 
 import os
+import json
 import unittest
 
-from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
-from .tokenization_tests_commons import create_and_check_tokenizer_commons, TemporaryDirectory
+from pytorch_transformers.tokenization_roberta import RobertaTokenizer, DICT_FILES_NAMES
+from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
+from .tokenization_tests_commons import CommonTestCases
 
 
-class RobertaTokenizationTest(unittest.TestCase):
+class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
+    tokenizer_class = RobertaTokenizer
 
-    def test_full_tokenizer(self):
-        """ Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt """
+    def setUp(self):
+        super(RobertaTokenizationTest, self).setUp()
+
+        # Adapted from Sennrich et al. 2015 and https://github.com/rsennrich/subword-nmt
         vocab = ["l", "o", "w", "e", "r", "s", "t", "i", "d", "n",
                  "lo", "low", "er",
                  "low", "lowest", "newer", "wider", "<unk>"]
         vocab_tokens = dict(zip(vocab, range(len(vocab))))
-        special_tokens_map = {"unk_token": "<unk>"}
+        merges = ["#version: 0.2", "l o", "lo w", "e r", ""]
+        self.special_tokens_map = {"unk_token": "<unk>"}
 
-        with TemporaryDirectory() as tmpdirname:
-            vocab_file = os.path.join(tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
-            with open(vocab_file, "w") as fp:
-                [fp.write(f"{vocab} {index}\n") for index, vocab in enumerate(vocab_tokens)]
+        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['vocab_file'])
+        self.merges_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES['merges_file'])
+        with open(self.vocab_file, "w") as fp:
+            fp.write(json.dumps(vocab_tokens))
+        with open(self.merges_file, "w") as fp:
+            fp.write("\n".join(merges))
 
-            input_text = u"lower newer"
-            output_text = u"lower<unk>newer"
+    def get_tokenizer(self):
+        bpe_tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
+        return RobertaTokenizer.from_pretrained("roberta-base", bpe_tokenizer=bpe_tokenizer)
 
-            create_and_check_tokenizer_commons(self, input_text, output_text, RobertaTokenizer, tmpdirname, **special_tokens_map)
+    def get_input_output_texts(self):
+        input_text = u"lower newer"
+        output_text = u"lower<unk>newer"
+        return input_text, output_text
 
-            tokenizer = RobertaTokenizer(vocab_file, **special_tokens_map)
-            text = "lower"
-            bpe_tokens = ["low", "er"]
-            tokens = tokenizer.tokenize(text)
-            self.assertListEqual(tokens, bpe_tokens)
+    def test_full_tokenizer(self):
+        tokenizer = self.get_tokenizer()
+        text = "lower"
+        bpe_tokens = ["low", "er"]
+        tokens = tokenizer.tokenize(text)
+        self.assertListEqual(tokens, bpe_tokens)
 
-            input_tokens = tokens + [tokenizer.unk_token]
-            input_bpe_tokens = [13, 12, 17]
-            self.assertListEqual(
-                tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+        input_tokens = tokens + [tokenizer.unk_token]
+        input_bpe_tokens = [0, 4, 12, 176, 2]
+        tokenizer.convert_tokens_to_ids(input_tokens)
+        self.assertListEqual(
+            tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
+
+    def roberta_dict_integration_testing(self):
+        tokenizer = self.get_tokenizer()
+
+        self.assertListEqual(
+            tokenizer.encode('Hello world!'),
+            [0, 31414, 232, 328, 2]
+        )
+        self.assertListEqual(
+            tokenizer.encode('Hello world! cécé herlolip'),
+            [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
+        )
 
 
 if __name__ == '__main__':
diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index ebcf6f48d8..e766a825a0 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -105,7 +105,7 @@ class CommonTestCases:
             self.assertEqual(added_toks, len(new_toks))
             self.assertEqual(all_size_2, all_size + len(new_toks))
 
-            tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
+            tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l", no_sep_cls_tokens=True)
             self.assertGreaterEqual(len(tokens), 4)
             self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
             self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
@@ -121,7 +121,8 @@ class CommonTestCases:
             self.assertEqual(added_toks_2, len(new_toks_2))
             self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
 
-            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
+            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
+                                      no_sep_cls_tokens=True)
 
             self.assertGreaterEqual(len(tokens), 6)
             self.assertGreater(tokens[0], tokenizer.vocab_size - 1)

From 14e970c271f8c1f21d46aaadf7e89852d329d3a8 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 9 Aug 2019 15:01:38 -0400
Subject: [PATCH 10/45] Tokenization encode/decode class-based sequence
 handling

---
 .../tests/tokenization_tests_commons.py       |  5 ++-
 pytorch_transformers/tokenization_bert.py     |  8 +++++
 pytorch_transformers/tokenization_utils.py    | 35 ++++++++++---------
 pytorch_transformers/tokenization_xlm.py      |  8 +++++
 pytorch_transformers/tokenization_xlnet.py    | 10 ++++++
 5 files changed, 47 insertions(+), 19 deletions(-)

diff --git a/pytorch_transformers/tests/tokenization_tests_commons.py b/pytorch_transformers/tests/tokenization_tests_commons.py
index e766a825a0..ebcf6f48d8 100644
--- a/pytorch_transformers/tests/tokenization_tests_commons.py
+++ b/pytorch_transformers/tests/tokenization_tests_commons.py
@@ -105,7 +105,7 @@ class CommonTestCases:
             self.assertEqual(added_toks, len(new_toks))
             self.assertEqual(all_size_2, all_size + len(new_toks))
 
-            tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l", no_sep_cls_tokens=True)
+            tokens = tokenizer.encode("aaaaabbbbbb low cccccccccdddddddd l")
             self.assertGreaterEqual(len(tokens), 4)
             self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
             self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
@@ -121,8 +121,7 @@ class CommonTestCases:
             self.assertEqual(added_toks_2, len(new_toks_2))
             self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))
 
-            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l",
-                                      no_sep_cls_tokens=True)
+            tokens = tokenizer.encode(">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||>|>>>>|> l")
 
             self.assertGreaterEqual(len(tokens), 6)
             self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 9bf18a97d7..9f4f00a300 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -166,6 +166,14 @@ class BertTokenizer(PreTrainedTokenizer):
         out_string = ' '.join(tokens).replace(' ##', '').strip()
         return out_string
 
+    def add_special_tokens_single_sentence(self, token_ids):
+        return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
+
+    def add_special_tokens_sentences_pair(self, *token_ids):
+        sep = [self._convert_token_to_id(self.sep_token)]
+        cls = [self._convert_token_to_id(self.cls_token)]
+        return cls + token_ids[0] + sep + token_ids[1] + sep
+
     def save_vocabulary(self, vocab_path):
         """Save the tokenizer vocabulary to a directory or file."""
         index = 0
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 232ef1c35b..a3581fe582 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -495,7 +495,7 @@ class PreTrainedTokenizer(object):
         """
         raise NotImplementedError
 
-    def convert_tokens_to_ids(self, tokens, **kwargs):
+    def convert_tokens_to_ids(self, tokens):
         """ Converts a single token, or a sequence of tokens, (str/unicode) in a single integer id
             (resp. a sequence of ids), using the vocabulary.
         """
@@ -519,31 +519,35 @@ class PreTrainedTokenizer(object):
     def _convert_token_to_id(self, token):
         raise NotImplementedError
 
-
-    def encode(self, *text, cls_token_at_end=False, double_sep_token=False, no_sep_cls_tokens=False):
+    def encode(self, text, add_special_tokens=False, *sequences):
         """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
         
         Same doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
         """
 
-        if len(text) == 1:
-            return self.convert_tokens_to_ids(self.tokenize(text[0]), no_sep_cls_tokens=no_sep_cls_tokens)
+        if len(sequences) == 0:
+            if add_special_tokens:
+                return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text)))
+            else:
+                return self.convert_tokens_to_ids(self.tokenize(text))
 
-        if len(text) > 2:
+        if len(sequences) > 1:
             logger.warning("Tokenization currently only supports sentence pairs. Ignoring every string following the "
                            "initial two.")
 
-        first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text[0])]
-        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text[1])]
-        sep = [self._convert_token_to_id(self.sep_token)]
-        cls = [self._convert_token_to_id(self.cls_token)]
-        n_sep_token = 2 if double_sep_token else 1
+        first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)]
+        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(sequences[0])]
 
-        tokens = first_sentence_tokens + sep * n_sep_token + second_sentence_tokens + sep
-        tokens = (tokens + cls) if cls_token_at_end else (cls + tokens)
+        if add_special_tokens:
+            return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)
+        else:
+            return first_sentence_tokens, second_sentence_tokens
 
-        return tokens
+    def add_special_tokens_single_sentence(self, token_ids):
+        raise NotImplementedError
 
+    def add_special_tokens_sentences_pair(self, *token_ids):
+        raise NotImplementedError
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
         """ Converts a single index or a sequence of indices (integers) in a token "
@@ -577,8 +581,7 @@ class PreTrainedTokenizer(object):
         """
         return ' '.join(self.convert_ids_to_tokens(tokens))
 
-    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True, cls_token_at_end=False,
-               double_sep_token=False):
+    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
         """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
             with options to remove special tokens and clean up tokenization spaces.
 
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index 899f6b884f..b0b8f1d78d 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -214,6 +214,14 @@ class XLMTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace('</w>', ' ').strip()
         return out_string
 
+    def add_special_tokens_single_sentence(self, token_ids):
+        return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
+
+    def add_special_tokens_sentences_pair(self, *token_ids):
+        sep = [self._convert_token_to_id(self.sep_token)]
+        cls = [self._convert_token_to_id(self.cls_token)]
+        return cls + token_ids[0] + sep + token_ids[1] + sep
+
     def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
         if not os.path.isdir(save_directory):
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index 919ac97bce..42473da860 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -177,6 +177,16 @@ class XLNetTokenizer(PreTrainedTokenizer):
         out_string = ''.join(tokens).replace(SPIECE_UNDERLINE, ' ').strip()
         return out_string
 
+    def add_special_tokens_single_sentence(self, token_ids):
+        logger.warning("No method was defined for special tokens and single sentence streams in XLNet. "
+                       "Returning token_ids")
+        return token_ids
+
+    def add_special_tokens_sentences_pair(self, *token_ids):
+        sep = [self._convert_token_to_id(self.sep_token)]
+        cls = [self._convert_token_to_id(self.cls_token)]
+        return token_ids[0] + sep + token_ids[1] + sep + cls
+
     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file
             to a directory.

From 75d5f98fd2a154bb5bfc0879c4a6e389c6789be5 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 9 Aug 2019 15:02:13 -0400
Subject: [PATCH 11/45] Roberta tokenization + fixed tests (py3 + py2).

---
 .../tests/modeling_roberta_test.py            |  40 +--
 .../tests/tokenization_roberta_test.py        |  11 +-
 pytorch_transformers/tokenization_roberta.py  | 311 ++++++++----------
 3 files changed, 138 insertions(+), 224 deletions(-)

diff --git a/pytorch_transformers/tests/modeling_roberta_test.py b/pytorch_transformers/tests/modeling_roberta_test.py
index e0455d8508..94035e9667 100644
--- a/pytorch_transformers/tests/modeling_roberta_test.py
+++ b/pytorch_transformers/tests/modeling_roberta_test.py
@@ -157,42 +157,6 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
             inputs_dict = {'input_ids': input_ids, 'token_type_ids': token_type_ids, 'attention_mask': input_mask}
             return config, inputs_dict
 
-        def test_inference_masked_lm(self):
-            model = RobertaForMaskedLM.from_pretrained('roberta-base')
-
-            input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-            output = model(input_ids)[0]
-            expected_shape = torch.Size((1, 11, 50265))
-            self.assertEqual(
-                output.shape,
-                expected_shape
-            )
-            # compare the actual values for a slice.
-            expected_slice = torch.Tensor(
-                [[[33.8843, -4.3107, 22.7779],
-                  [4.6533, -2.8099, 13.6252],
-                  [1.8222, -3.6898, 8.8600]]]
-            )
-            self.assertTrue(
-                torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
-            )
-
-        # @pytest.mark.slow
-        def test_inference_no_head(self):
-            model = RobertaModel.from_pretrained('roberta-base')
-
-            input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
-            output = model(input_ids)[0]
-            # compare the actual values for a slice.
-            expected_slice = torch.Tensor(
-                [[[-0.0231, 0.0782, 0.0074],
-                  [-0.1854, 0.0539, -0.0174],
-                  [0.0548, 0.0799, 0.1687]]]
-            )
-            self.assertTrue(
-                torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
-            )
-
     def setUp(self):
         self.model_tester = RobertaModelTest.RobertaModelTester(self)
         self.config_tester = ConfigTester(self, config_class=RobertaConfig, hidden_size=37)
@@ -220,7 +184,7 @@ class RobertaModelTest(CommonTestCases.CommonModelTester):
 
 class RobertaModelIntegrationTest(unittest.TestCase):
 
-    # @pytest.mark.slow
+    @pytest.mark.slow
     def test_inference_masked_lm(self):
         model = RobertaForMaskedLM.from_pretrained('roberta-base')
         
@@ -241,7 +205,7 @@ class RobertaModelIntegrationTest(unittest.TestCase):
             torch.allclose(output[:, :3, :3], expected_slice, atol=1e-3)
         )
 
-    # @pytest.mark.slow
+    @pytest.mark.slow
     def test_inference_no_head(self):
         model = RobertaModel.from_pretrained('roberta-base')
         
diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py
index fbb3f8381d..daefea0fa7 100644
--- a/pytorch_transformers/tests/tokenization_roberta_test.py
+++ b/pytorch_transformers/tests/tokenization_roberta_test.py
@@ -18,8 +18,7 @@ import os
 import json
 import unittest
 
-from pytorch_transformers.tokenization_roberta import RobertaTokenizer, DICT_FILES_NAMES
-from pytorch_transformers.tokenization_gpt2 import GPT2Tokenizer, VOCAB_FILES_NAMES
+from pytorch_transformers.tokenization_roberta import RobertaTokenizer, VOCAB_FILES_NAMES
 from .tokenization_tests_commons import CommonTestCases
 
 
@@ -45,8 +44,7 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
             fp.write("\n".join(merges))
 
     def get_tokenizer(self):
-        bpe_tokenizer = GPT2Tokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
-        return RobertaTokenizer.from_pretrained("roberta-base", bpe_tokenizer=bpe_tokenizer)
+        return RobertaTokenizer.from_pretrained(self.tmpdirname, **self.special_tokens_map)
 
     def get_input_output_texts(self):
         input_text = u"lower newer"
@@ -54,15 +52,14 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         return input_text, output_text
 
     def test_full_tokenizer(self):
-        tokenizer = self.get_tokenizer()
+        tokenizer = RobertaTokenizer(self.vocab_file, self.merges_file, **self.special_tokens_map)
         text = "lower"
         bpe_tokens = ["low", "er"]
         tokens = tokenizer.tokenize(text)
         self.assertListEqual(tokens, bpe_tokens)
 
         input_tokens = tokens + [tokenizer.unk_token]
-        input_bpe_tokens = [0, 4, 12, 176, 2]
-        tokenizer.convert_tokens_to_ids(input_tokens)
+        input_bpe_tokens = [13, 12, 17]
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
index 4ec53a65b0..b01b92653d 100644
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -12,229 +12,182 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for RoBERTa."""
+"""Tokenization classes for OpenAI GPT."""
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 
+import sys
 import json
 import logging
-import re
-from io import open
-import six
 import os
+import regex as re
+from io import open
 
+from .tokenization_gpt2 import bytes_to_unicode, get_pairs
 from .tokenization_utils import PreTrainedTokenizer
-from .tokenization_gpt2 import GPT2Tokenizer
+
+try:
+    from functools import lru_cache
+except ImportError:
+    # Just a dummy decorator to get the checks to run on python2
+    # because honestly I don't want to support a byte-level unicode BPE tokenizer on python 2 right now.
+    def lru_cache():
+        return lambda func: func
 
 logger = logging.getLogger(__name__)
 
-DICT_FILES_NAMES = {
-    'dict_file': 'dict.txt',
+VOCAB_FILES_NAMES = {
+    'vocab_file': 'vocab.json',
+    'merges_file': 'merges.txt',
 }
 
-PRETRAINED_DICT_FILES_MAP = {
-    'dict_file':
-        {
-            'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
-            'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
-            'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-dict.txt",
-        },
+PRETRAINED_VOCAB_FILES_MAP = {
+    'vocab_file':
+    {
+        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-vocab.json",
+        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-vocab.json",
+        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-vocab.json",
+    },
+    'merges_file':
+    {
+        'roberta-base': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-base-merges.txt",
+        'roberta-large': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-merges.txt",
+        'roberta-large-mnli': "https://s3.amazonaws.com/models.huggingface.co/bert/roberta-large-mnli-merges.txt",
+    },
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'roberta-base': 512,
-    'roberta-large': 512,
-    'roberta-large-mnli': 512,
+    'roberta-base': 1024,
+    'roberta-large': 1024,
+    'roberta-large-mnli': 1024,
 }
 
-SPACE_NORMALIZER = re.compile(r"\s+")
-
-def tokenize_line(line):
-    line = SPACE_NORMALIZER.sub(" ", line)
-    line = line.strip()
-    return line.split()
-
-
-class Dictionary(object):
-    """
-    A mapping from symbols to consecutive integers
-
-    From Facebook's fairseq.
-    """
-
-    def __init__(
-        self,
-        pad='<pad>',
-        eos='</s>',
-        unk='<unk>',
-        bos='<s>',
-        extra_special_symbols=None,
-    ):
-        self.unk_word, self.pad_word, self.eos_word = unk, pad, eos
-        self.symbols = []
-        self.count = []
-        self.indices = {}
-        self.bos_index = self.add_symbol(bos)
-        self.pad_index = self.add_symbol(pad)
-        self.eos_index = self.add_symbol(eos)
-        self.unk_index = self.add_symbol(unk)
-        if extra_special_symbols:
-            for s in extra_special_symbols:
-                self.add_symbol(s)
-        self.nspecial = len(self.symbols)
-
-    def __getitem__(self, idx):
-        if idx < len(self.symbols):
-            return self.symbols[idx]
-        return self.unk_word
-
-    def index(self, sym):
-        """Returns the index of the specified symbol"""
-        assert isinstance(sym, str)
-        if sym in self.indices:
-            return self.indices[sym]
-        return self.unk_index
-
-    def add_symbol(self, word, n=1):
-        """Adds a word to the dictionary"""
-        if word in self.indices:
-            idx = self.indices[word]
-            self.count[idx] = self.count[idx] + n
-            return idx
-        else:
-            idx = len(self.symbols)
-            self.indices[word] = idx
-            self.symbols.append(word)
-            self.count.append(n)
-            return idx
-
-    @classmethod
-    def load(cls, f, ignore_utf_errors=False):
-        """Loads the dictionary from a text file with the format:
-
-        ```
-        <symbol0> <count0>
-        <symbol1> <count1>
-        ...
-        ```
-        """
-        d = cls()
-        d.add_from_file(f, ignore_utf_errors)
-        return d
-
-    def add_from_file(self, f, ignore_utf_errors=False):
-        """
-        Loads a pre-existing dictionary from a text file and adds its symbols
-        to this instance.
-        """
-        if isinstance(f, six.string_types):
-            try:
-                if not ignore_utf_errors:
-                    with open(f, 'r', encoding='utf-8') as fd:
-                        self.add_from_file(fd)
-                else:
-                    with open(f, 'r', encoding='utf-8', errors='ignore') as fd:
-                        self.add_from_file(fd)
-            except FileNotFoundError as fnfe:
-                raise fnfe
-            except UnicodeError:
-                raise Exception("Incorrect encoding detected in {}, please "
-                                "rebuild the dataset".format(f))
-            return
-
-        lines = f.read().splitlines()
-        for line in lines:
-            idx = line.rfind(' ')
-            if idx == -1:
-                raise ValueError("Incorrect dictionary format, expected '<token> <cnt>'")
-            word = line[:idx]
-            count = int(line[idx + 1:])
-            self.indices[word] = len(self.symbols)
-            self.symbols.append(word)
-            self.count.append(count)
-
-    def encode_line(self, line, line_tokenizer=tokenize_line, add_if_not_exist=True,
-                    consumer=None, append_eos=True, reverse_order=False):
-        words = line_tokenizer(line)
-        if reverse_order:
-            words = list(reversed(words))
-        nwords = len(words)
-        ids = [0] * (nwords + 1 if append_eos else nwords)
-
-        for i, word in enumerate(words):
-            if add_if_not_exist:
-                idx = self.add_symbol(word)
-            else:
-                idx = self.index(word)
-            if consumer is not None:
-                consumer(word, idx)
-            ids[i] = idx
-        if append_eos:
-            ids[nwords] = self.eos_index
-        return ids
-
 
 class RobertaTokenizer(PreTrainedTokenizer):
     """
-    RoBERTa tokenizer. Peculiarities:
-        - GPT-2 tokenizer with a different integer mapping on top.
+    GPT-2 BPE tokenizer. Peculiarities:
+        - Byte-level BPE
     """
-    vocab_files_names = DICT_FILES_NAMES
-    pretrained_vocab_files_map = PRETRAINED_DICT_FILES_MAP
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
-    def __init__(self, dict_file, bpe_tokenizer=None, bos_token="<s>", eos_token="</s>", sep_token="</s>", cls_token="<s>",
-                 unk_token="<unk>", **kwargs):
-        super(RobertaTokenizer, self).__init__(cls_token=bos_token, sep_token=eos_token, eos_token=eos_token,
-                                               unk_token=unk_token, **kwargs)
+    def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
+                 cls_token="<s>", unk_token="<unk>", **kwargs):
+        super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
+                                               sep_token=sep_token, cls_token=cls_token, **kwargs)
 
-        self.gpt2_tokenizer = GPT2Tokenizer.from_pretrained("gpt2") if bpe_tokenizer is None else bpe_tokenizer
-        self.dictionary = Dictionary.load(dict_file)
+        self.encoder = json.load(open(vocab_file, encoding="utf-8"))
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        bpe_data = open(merges_file, encoding='utf-8').read().split('\n')[1:-1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_data]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {}
+
+        # Should haved added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
 
     @property
     def vocab_size(self):
-        return len(self.dictionary.indices)
+        return len(self.encoder)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key = lambda pair: self.bpe_ranks.get(pair, float('inf')))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                    new_word.extend(word[i:j])
+                    i = j
+                except:
+                    new_word.extend(word[i:])
+                    break
+
+                if word[i] == first and i < len(word)-1 and word[i+1] == second:
+                    new_word.append(first+second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
 
     def _tokenize(self, text):
-        """ Use GPT-2 Tokenizer """
-        return self.gpt2_tokenizer._tokenize(text)
+        """ Tokenize a string. """
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            if sys.version_info[0] == 2:
+                token = ''.join(self.byte_encoder[ord(b)] for b in token)
+            else:
+                token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
 
     def _convert_token_to_id(self, token):
-        if self.dictionary.index(token) != 3:
-            return self.dictionary.index(token)
-        return self.dictionary.index(str(self.gpt2_tokenizer.convert_tokens_to_ids(token)))
+        """ Converts a token (str/unicode) in an id using the vocab. """
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
 
     def _convert_id_to_token(self, index):
-        symbol = self.dictionary[index]
-        try:
-            idx = int(symbol)
-            return self.gpt2_tokenizer._convert_id_to_token(idx)
-        except ValueError:
-            return symbol
+        """Converts an index (integer) in a token (string/unicode) using the vocab."""
+        return self.decoder.get(index)
 
     def convert_tokens_to_string(self, tokens):
-        return self.gpt2_tokenizer.convert_tokens_to_string(tokens)
+        """ Converts a sequence of tokens (string) in a single string. """
+        text = ''.join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode('utf-8', errors=self.errors)
+        return text
 
-    def convert_tokens_to_ids(self, tokens, no_sep_cls_tokens=False):
-        cls = [self._convert_token_to_id(self.cls_token)]
-        tokens = super().convert_tokens_to_ids(tokens)
+    def add_special_tokens_single_sentence(self, token_ids):
+        return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
+
+    def add_special_tokens_sentences_pair(self, *token_ids):
         sep = [self._convert_token_to_id(self.sep_token)]
-        return (cls + tokens + sep) if (isinstance(tokens, list) and not no_sep_cls_tokens) else tokens
-
-    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
-        return super().convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)[1:-1]
+        cls = [self._convert_token_to_id(self.cls_token)]
+        return cls + token_ids[0] + sep + sep + token_ids[1] + sep
 
     def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
         if not os.path.isdir(save_directory):
             logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
             return
-        dict_file = os.path.join(save_directory, DICT_FILES_NAMES['dict_file'])
+        vocab_file = os.path.join(save_directory, VOCAB_FILES_NAMES['vocab_file'])
+        merge_file = os.path.join(save_directory, VOCAB_FILES_NAMES['merges_file'])
 
-        with open(dict_file, 'w', encoding='utf-8') as f:
-            for i in range(self.dictionary.nspecial, len(self.dictionary.count)):
-                f.write(f"{list(self.dictionary.indices.keys())[i]} {self.dictionary.count[i]}\n")
+        with open(vocab_file, 'w', encoding='utf-8') as f:
+            f.write(json.dumps(self.encoder, ensure_ascii=False))
 
-        vocab_files = self.gpt2_tokenizer.save_pretrained(save_directory)
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write(u'#version: 0.2\n')
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning("Saving vocabulary to {}: BPE merge indices are not consecutive."
+                                   " Please check that the tokenizer is not corrupted!".format(merge_file))
+                    index = token_index
+                writer.write(' '.join(bpe_tokens) + u'\n')
+                index += 1
 
-        return vocab_files + (dict_file,)
+        return vocab_file, merge_file

From a7b4cfe9194bf93c7044a42c9f1281260ce6279e Mon Sep 17 00:00:00 2001
From: carefree0910 <syameimaru_kurumi@pku.edu.cn>
Date: Sun, 11 Aug 2019 21:36:51 +0800
Subject: [PATCH 12/45] Update README.md

I assume the example should demonstrate the `re-load` functionality right after the `save` functionality. However, I'm surprised that nobody has pointed this out after such a long time, so maybe I've misunderstood the purpose. This PR is just in case :)
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index b86a5238c2..48c54a055a 100644
--- a/README.md
+++ b/README.md
@@ -123,7 +123,7 @@ traced_model = torch.jit.trace(model, (input_ids,))
 model.save_pretrained('./directory/to/save/')  # save
 model = model_class.from_pretrained('./directory/to/save/')  # re-load
 tokenizer.save_pretrained('./directory/to/save/')  # save
-tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
+tokenizer = tokenizer_class.from_pretrained('./directory/to/save/')  # re-load
 
 # SOTA examples for GLUE, SQUAD, text generation...
 ```

From b3d83d68db2db037a439516c24c593d4a85035a7 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 12 Aug 2019 12:28:55 -0400
Subject: [PATCH 13/45] Fixup 9d0603148bc34255fad0cad73ce438ecd7306322

---
 .../convert_roberta_checkpoint_to_pytorch.py                 | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
index e4e8fbb25d..0a8967426e 100644
--- a/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
+++ b/pytorch_transformers/convert_roberta_checkpoint_to_pytorch.py
@@ -139,7 +139,10 @@ def convert_roberta_checkpoint_to_pytorch(roberta_checkpoint_path, pytorch_dump_
     input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0) # batch of size 1
 
     our_output = model(input_ids)[0]
-    their_output = roberta.model(input_ids)[0]
+    if classification_head:
+        their_output = roberta.model.classification_heads['mnli'](roberta.extract_features(input_ids))
+    else:
+        their_output = roberta.model(input_ids)[0]
     print(our_output.shape, their_output.shape)
     success = torch.allclose(our_output, their_output, atol=1e-3)
     print(

From 912fdff899cf0fd674ed357e46a0209311aefad2 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Mon, 12 Aug 2019 13:49:50 -0400
Subject: [PATCH 14/45] [RoBERTa] Update `run_glue` for RoBERTa

---
 examples/run_glue.py   | 13 +++++++++----
 examples/utils_glue.py | 17 +++++++++++++----
 2 files changed, 22 insertions(+), 8 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index a939ea373b..f6cd73ed0b 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -13,7 +13,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet)."""
+""" Finetuning the library models for sequence classification on GLUE (Bert, XLM, XLNet, RoBERTa)."""
 
 from __future__ import absolute_import, division, print_function
 
@@ -33,6 +33,9 @@ from tqdm import tqdm, trange
 
 from pytorch_transformers import (WEIGHTS_NAME, BertConfig,
                                   BertForSequenceClassification, BertTokenizer,
+                                  RobertaConfig,
+                                  RobertaForSequenceClassification,
+                                  RobertaTokenizer,
                                   XLMConfig, XLMForSequenceClassification,
                                   XLMTokenizer, XLNetConfig,
                                   XLNetForSequenceClassification,
@@ -45,12 +48,13 @@ from utils_glue import (compute_metrics, convert_examples_to_features,
 
 logger = logging.getLogger(__name__)
 
-ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig)), ())
+ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (BertConfig, XLNetConfig, XLMConfig, RobertaConfig)), ())
 
 MODEL_CLASSES = {
     'bert': (BertConfig, BertForSequenceClassification, BertTokenizer),
     'xlnet': (XLNetConfig, XLNetForSequenceClassification, XLNetTokenizer),
     'xlm': (XLMConfig, XLMForSequenceClassification, XLMTokenizer),
+    'roberta': (RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer),
 }
 
 
@@ -214,7 +218,7 @@ def evaluate(args, model, tokenizer, prefix=""):
             with torch.no_grad():
                 inputs = {'input_ids':      batch[0],
                           'attention_mask': batch[1],
-                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM don't use segment_ids
+                          'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM and RoBERTa don't use segment_ids
                           'labels':         batch[3]}
                 outputs = model(**inputs)
                 tmp_eval_loss, logits = outputs[:2]
@@ -268,8 +272,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
         features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
             cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
             cls_token=tokenizer.cls_token,
-            sep_token=tokenizer.sep_token,
             cls_token_segment_id=2 if args.model_type in ['xlnet'] else 0,
+            sep_token=tokenizer.sep_token,
+            sep_token_extra=bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
             pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
             pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
         if args.local_rank in [-1, 0]:
diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index bba9a901a8..c955e4d0ce 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -390,10 +390,16 @@ class WnliProcessor(DataProcessor):
 
 def convert_examples_to_features(examples, label_list, max_seq_length,
                                  tokenizer, output_mode,
-                                 cls_token_at_end=False, pad_on_left=False,
-                                 cls_token='[CLS]', sep_token='[SEP]', pad_token=0,
-                                 sequence_a_segment_id=0, sequence_b_segment_id=1,
-                                 cls_token_segment_id=1, pad_token_segment_id=0,
+                                 cls_token_at_end=False,
+                                 cls_token='[CLS]',
+                                 cls_token_segment_id=1,
+                                 sep_token='[SEP]',
+                                 sep_token_extra=False,
+                                 pad_on_left=False,
+                                 pad_token=0,
+                                 pad_token_segment_id=0,
+                                 sequence_a_segment_id=0, 
+                                 sequence_b_segment_id=1,
                                  mask_padding_with_zero=True):
     """ Loads a data file into a list of `InputBatch`s
         `cls_token_at_end` define the location of the CLS token:
@@ -442,6 +448,9 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
         # used as as the "sentence vector". Note that this only makes sense because
         # the entire model is fine-tuned.
         tokens = tokens_a + [sep_token]
+        if sep_token_extra:
+            # roberta uses an extra separator b/w pairs of sentences
+            tokens += [sep_token]
         segment_ids = [sequence_a_segment_id] * len(tokens)
 
         if tokens_b:

From 22ac004a7c9cc76d930ecc95b6b0469cd6693b16 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 12 Aug 2019 15:13:53 -0400
Subject: [PATCH 15/45] Added documentation and changed parameters for
 special_tokens_sentences_pair.

---
 pytorch_transformers/tokenization_bert.py    | 12 +++++++++--
 pytorch_transformers/tokenization_roberta.py | 22 +++++++++++++-------
 pytorch_transformers/tokenization_utils.py   |  2 +-
 pytorch_transformers/tokenization_xlm.py     | 12 +++++++++--
 pytorch_transformers/tokenization_xlnet.py   | 20 ++++++++++++------
 5 files changed, 50 insertions(+), 18 deletions(-)

diff --git a/pytorch_transformers/tokenization_bert.py b/pytorch_transformers/tokenization_bert.py
index 9f4f00a300..177d26dec1 100644
--- a/pytorch_transformers/tokenization_bert.py
+++ b/pytorch_transformers/tokenization_bert.py
@@ -167,12 +167,20 @@ class BertTokenizer(PreTrainedTokenizer):
         return out_string
 
     def add_special_tokens_single_sentence(self, token_ids):
+        """
+        Adds special tokens to a sequence for sequence classification tasks.
+        A BERT sequence has the following format: [CLS] X [SEP]
+        """
         return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
 
-    def add_special_tokens_sentences_pair(self, *token_ids):
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+        """
+        Adds special tokens to a sequence pair for sequence classification tasks.
+        A BERT sequence pair has the following format: [CLS] A [SEP] B [SEP]
+        """
         sep = [self._convert_token_to_id(self.sep_token)]
         cls = [self._convert_token_to_id(self.cls_token)]
-        return cls + token_ids[0] + sep + token_ids[1] + sep
+        return cls + token_ids_0 + sep + token_ids_1 + sep
 
     def save_vocabulary(self, vocab_path):
         """Save the tokenizer vocabulary to a directory or file."""
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
index b01b92653d..8f5cecee8a 100644
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-"""Tokenization classes for OpenAI GPT."""
+"""Tokenization classes for RoBERTa."""
 from __future__ import (absolute_import, division, print_function,
                         unicode_literals)
 
@@ -57,15 +57,15 @@ PRETRAINED_VOCAB_FILES_MAP = {
 }
 
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
-    'roberta-base': 1024,
-    'roberta-large': 1024,
-    'roberta-large-mnli': 1024,
+    'roberta-base': 512,
+    'roberta-large': 512,
+    'roberta-large-mnli': 512,
 }
 
 
 class RobertaTokenizer(PreTrainedTokenizer):
     """
-    GPT-2 BPE tokenizer. Peculiarities:
+    RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
         - Byte-level BPE
     """
     vocab_files_names = VOCAB_FILES_NAMES
@@ -161,12 +161,20 @@ class RobertaTokenizer(PreTrainedTokenizer):
         return text
 
     def add_special_tokens_single_sentence(self, token_ids):
+        """
+        Adds special tokens to a sequence for sequence classification tasks.
+        A RoBERTa sequence has the following format: [CLS] X [SEP]
+        """
         return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
 
-    def add_special_tokens_sentences_pair(self, *token_ids):
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+        """
+        Adds special tokens to a sequence pair for sequence classification tasks.
+        A RoBERTa sequence pair has the following format: [CLS] A [SEP][SEP] B [SEP]
+        """
         sep = [self._convert_token_to_id(self.sep_token)]
         cls = [self._convert_token_to_id(self.cls_token)]
-        return cls + token_ids[0] + sep + sep + token_ids[1] + sep
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
 
     def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index a3581fe582..3253596058 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -546,7 +546,7 @@ class PreTrainedTokenizer(object):
     def add_special_tokens_single_sentence(self, token_ids):
         raise NotImplementedError
 
-    def add_special_tokens_sentences_pair(self, *token_ids):
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
         raise NotImplementedError
 
     def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
diff --git a/pytorch_transformers/tokenization_xlm.py b/pytorch_transformers/tokenization_xlm.py
index b0b8f1d78d..b690a3a945 100644
--- a/pytorch_transformers/tokenization_xlm.py
+++ b/pytorch_transformers/tokenization_xlm.py
@@ -215,12 +215,20 @@ class XLMTokenizer(PreTrainedTokenizer):
         return out_string
 
     def add_special_tokens_single_sentence(self, token_ids):
+        """
+        Adds special tokens to a sequence for sequence classification tasks.
+        An XLM sequence has the following format: [CLS] X [SEP]
+        """
         return [self._convert_token_to_id(self.cls_token)] + token_ids + [self._convert_token_to_id(self.sep_token)]
 
-    def add_special_tokens_sentences_pair(self, *token_ids):
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+        """
+        Adds special tokens to a sequence pair for sequence classification tasks.
+        An XLM sequence pair has the following format: [CLS] A [SEP] B [SEP]
+        """
         sep = [self._convert_token_to_id(self.sep_token)]
         cls = [self._convert_token_to_id(self.cls_token)]
-        return cls + token_ids[0] + sep + token_ids[1] + sep
+        return cls + token_ids_0 + sep + token_ids_1 + sep
 
     def save_vocabulary(self, save_directory):
         """Save the tokenizer vocabulary and merge files to a directory."""
diff --git a/pytorch_transformers/tokenization_xlnet.py b/pytorch_transformers/tokenization_xlnet.py
index 42473da860..371b3c9407 100644
--- a/pytorch_transformers/tokenization_xlnet.py
+++ b/pytorch_transformers/tokenization_xlnet.py
@@ -178,14 +178,22 @@ class XLNetTokenizer(PreTrainedTokenizer):
         return out_string
 
     def add_special_tokens_single_sentence(self, token_ids):
-        logger.warning("No method was defined for special tokens and single sentence streams in XLNet. "
-                       "Returning token_ids")
-        return token_ids
-
-    def add_special_tokens_sentences_pair(self, *token_ids):
+        """
+        Adds special tokens to a sequence for sequence classification tasks.
+        An XLNet sequence has the following format: X [SEP][CLS]
+        """
         sep = [self._convert_token_to_id(self.sep_token)]
         cls = [self._convert_token_to_id(self.cls_token)]
-        return token_ids[0] + sep + token_ids[1] + sep + cls
+        return token_ids + sep + cls
+
+    def add_special_tokens_sentences_pair(self, token_ids_0, token_ids_1):
+        """
+        Adds special tokens to a sequence pair for sequence classification tasks.
+        An XLNet sequence pair has the following format: A [SEP] B [SEP][CLS]
+        """
+        sep = [self._convert_token_to_id(self.sep_token)]
+        cls = [self._convert_token_to_id(self.cls_token)]
+        return token_ids_0 + sep + token_ids_1 + sep + cls
 
     def save_vocabulary(self, save_directory):
         """ Save the sentencepiece vocabulary (copy original file) and special tokens file

From 634a3172d869e2ff772b2e0813169641ca9e6cc5 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Mon, 12 Aug 2019 15:14:15 -0400
Subject: [PATCH 16/45] Added integration tests for sequence builders.

---
 .../tests/tokenization_bert_test.py                | 11 +++++++++++
 .../tests/tokenization_roberta_test.py             | 14 +++++++++++++-
 .../tests/tokenization_xlm_test.py                 | 11 +++++++++++
 .../tests/tokenization_xlnet_test.py               | 12 ++++++++++++
 4 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/pytorch_transformers/tests/tokenization_bert_test.py b/pytorch_transformers/tests/tokenization_bert_test.py
index 5eb39b729d..db507317a8 100644
--- a/pytorch_transformers/tests/tokenization_bert_test.py
+++ b/pytorch_transformers/tests/tokenization_bert_test.py
@@ -125,6 +125,17 @@ class BertTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertFalse(_is_punctuation(u"A"))
         self.assertFalse(_is_punctuation(u" "))
 
+    def test_sequence_builders(self):
+        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
+
+        text = tokenizer.encode("sequence builders")
+        text_2 = tokenizer.encode("multi-sequence build")
+
+        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
+        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
+
+        assert encoded_sentence == [101] + text + [102]
+        assert encoded_pair == [101] + text + [102] + text_2 + [102]
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py
index daefea0fa7..b76b3e311d 100644
--- a/pytorch_transformers/tests/tokenization_roberta_test.py
+++ b/pytorch_transformers/tests/tokenization_roberta_test.py
@@ -71,10 +71,22 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
             [0, 31414, 232, 328, 2]
         )
         self.assertListEqual(
-            tokenizer.encode('Hello world! cécé herlolip'),
+            tokenizer.encode('Hello world! cécé herlolip 418'),
             [0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]
         )
 
+    def test_sequence_builders(self):
+        tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+
+        text = tokenizer.encode("sequence builders")
+        text_2 = tokenizer.encode("multi-sequence build")
+
+        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
+        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
+
+        assert encoded_sentence == [0] + text + [2]
+        assert encoded_pair == [0] + text + [2, 2] + text_2 + [2]
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_xlm_test.py b/pytorch_transformers/tests/tokenization_xlm_test.py
index a20e92044f..ede77a1f98 100644
--- a/pytorch_transformers/tests/tokenization_xlm_test.py
+++ b/pytorch_transformers/tests/tokenization_xlm_test.py
@@ -66,6 +66,17 @@ class XLMTokenizationTest(CommonTestCases.CommonTokenizerTester):
         self.assertListEqual(
             tokenizer.convert_tokens_to_ids(input_tokens), input_bpe_tokens)
 
+    def test_sequence_builders(self):
+        tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
+
+        text = tokenizer.encode("sequence builders")
+        text_2 = tokenizer.encode("multi-sequence build")
+
+        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
+        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
+
+        assert encoded_sentence == [1] + text + [1]
+        assert encoded_pair == [1] + text + [1] + text_2 + [1]
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/pytorch_transformers/tests/tokenization_xlnet_test.py b/pytorch_transformers/tests/tokenization_xlnet_test.py
index 08e9e9cb2d..9feab7c0bd 100644
--- a/pytorch_transformers/tests/tokenization_xlnet_test.py
+++ b/pytorch_transformers/tests/tokenization_xlnet_test.py
@@ -89,6 +89,18 @@ class XLNetTokenizationTest(CommonTestCases.CommonTokenizerTester):
                                       u'9', u'2', u'0', u'0', u'0', u',', SPIECE_UNDERLINE + u'and', SPIECE_UNDERLINE + u'this',
                                       SPIECE_UNDERLINE + u'is', SPIECE_UNDERLINE + u'f', u'al', u'se', u'.'])
 
+    def test_sequence_builders(self):
+        tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")
+
+        text = tokenizer.encode("sequence builders")
+        text_2 = tokenizer.encode("multi-sequence build")
+
+        encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
+        encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
+
+        assert encoded_sentence == text + [4, 3]
+        assert encoded_pair == text + [4] + text_2 + [4, 3]
+
 
 if __name__ == '__main__':
     unittest.main()

From ba4bce2581f9a67caa44c3cc959a2dacb0090670 Mon Sep 17 00:00:00 2001
From: tuvuumass <tuvu@cs.umass.edu>
Date: Tue, 13 Aug 2019 11:26:27 -0400
Subject: [PATCH 17/45] fix issue #824

---
 examples/run_bertology.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/examples/run_bertology.py b/examples/run_bertology.py
index 61c7440ecb..f11b73b54f 100644
--- a/examples/run_bertology.py
+++ b/examples/run_bertology.py
@@ -211,10 +211,12 @@ def prune_heads(args, model, eval_dataloader, head_mask):
 
 def main():
     parser = argparse.ArgumentParser()
+    ## Required parameters
     parser.add_argument("--data_dir", default=None, type=str, required=True,
                         help="The input data dir. Should contain the .tsv files (or other data files) for the task.")
-    parser.add_argument("--model_name", default=None, type=str, required=True,
-                        help="Bert/XLNet/XLM pre-trained model selected in the list: " + ", ".join(ALL_MODELS))
+    parser.add_argument("--model_name_or_path", default=None, type=str, required=True,
+                        help="Path to pre-trained model or shortcut name selected in the list: " + ", ".join(
+                            ALL_MODELS))
     parser.add_argument("--task_name", default=None, type=str, required=True,
                         help="The name of the task to train selected in the list: " + ", ".join(processors.keys()))
     parser.add_argument("--output_dir", default=None, type=str, required=True,
@@ -222,9 +224,9 @@ def main():
 
     ## Other parameters
     parser.add_argument("--config_name", default="", type=str,
-                        help="Pretrained config name or path if not the same as model_name")
+                        help="Pretrained config name or path if not the same as model_name_or_path")
     parser.add_argument("--tokenizer_name", default="", type=str,
-                        help="Pretrained tokenizer name or path if not the same as model_name")
+                        help="Pretrained tokenizer name or path if not the same as model_name_or_path")
     parser.add_argument("--cache_dir", default="", type=str,
                         help="Where do you want to store the pre-trained models downloaded from s3")
     parser.add_argument("--data_subset", type=int, default=-1,
@@ -297,15 +299,15 @@ def main():
 
     args.model_type = ""
     for key in MODEL_CLASSES:
-        if key in args.model_name.lower():
+        if key in args.model_name_or_path.lower():
             args.model_type = key  # take the first match in model types
             break
     config_class, model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
-    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name,
+    config = config_class.from_pretrained(args.config_name if args.config_name else args.model_name_or_path,
                                           num_labels=num_labels, finetuning_task=args.task_name,
                                           output_attentions=True)
-    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name)
-    model = model_class.from_pretrained(args.model_name, from_tf=bool('.ckpt' in args.model_name), config=config)
+    tokenizer = tokenizer_class.from_pretrained(args.tokenizer_name if args.tokenizer_name else args.model_name_or_path)
+    model = model_class.from_pretrained(args.model_name_or_path, from_tf=bool('.ckpt' in args.model_name_or_path), config=config)
 
     if args.local_rank == 0:
         torch.distributed.barrier()  # Make sure only the first process in distributed training will download model & vocab

From 3d87991f606b36dc54318ac3dee9803001ef161d Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 13 Aug 2019 12:00:24 -0400
Subject: [PATCH 18/45] Fixed error with encoding

---
 .../tests/tokenization_roberta_test.py                |  7 +++++--
 pytorch_transformers/tokenization_utils.py            | 11 +++--------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/pytorch_transformers/tests/tokenization_roberta_test.py b/pytorch_transformers/tests/tokenization_roberta_test.py
index b76b3e311d..a8f940ae43 100644
--- a/pytorch_transformers/tests/tokenization_roberta_test.py
+++ b/pytorch_transformers/tests/tokenization_roberta_test.py
@@ -81,11 +81,14 @@ class RobertaTokenizationTest(CommonTestCases.CommonTokenizerTester):
         text = tokenizer.encode("sequence builders")
         text_2 = tokenizer.encode("multi-sequence build")
 
+        encoded_text_from_decode = tokenizer.encode("sequence builders", add_special_tokens=True)
+        encoded_pair_from_decode = tokenizer.encode("sequence builders", "multi-sequence build", add_special_tokens=True)
+
         encoded_sentence = tokenizer.add_special_tokens_single_sentence(text)
         encoded_pair = tokenizer.add_special_tokens_sentences_pair(text, text_2)
 
-        assert encoded_sentence == [0] + text + [2]
-        assert encoded_pair == [0] + text + [2, 2] + text_2 + [2]
+        assert encoded_sentence == encoded_text_from_decode
+        assert encoded_pair == encoded_pair_from_decode
 
 
 if __name__ == '__main__':
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 3253596058..7bb9fd9d29 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -519,24 +519,19 @@ class PreTrainedTokenizer(object):
     def _convert_token_to_id(self, token):
         raise NotImplementedError
 
-    def encode(self, text, add_special_tokens=False, *sequences):
+    def encode(self, text, text_pair=None, add_special_tokens=False):
         """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
         
         Same doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
         """
-
-        if len(sequences) == 0:
+        if text_pair is None:
             if add_special_tokens:
                 return self.add_special_tokens_single_sentence(self.convert_tokens_to_ids(self.tokenize(text)))
             else:
                 return self.convert_tokens_to_ids(self.tokenize(text))
 
-        if len(sequences) > 1:
-            logger.warning("Tokenization currently only supports sentence pairs. Ignoring every string following the "
-                           "initial two.")
-
         first_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text)]
-        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(sequences[0])]
+        second_sentence_tokens = [self._convert_token_to_id(token) for token in self.tokenize(text_pair)]
 
         if add_special_tokens:
             return self.add_special_tokens_sentences_pair(first_sentence_tokens, second_sentence_tokens)

From baf08ca1d4ab5aee1d530fc1801370e8a81cc091 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 13 Aug 2019 12:51:15 -0400
Subject: [PATCH 19/45] [RoBERTa] run_glue: correct pad_token + reorder labels

---
 examples/run_glue.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index f6cd73ed0b..445a9a5912 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -268,6 +268,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
     else:
         logger.info("Creating features from dataset file at %s", args.data_dir)
         label_list = processor.get_labels()
+        if task in ['mnli', 'mnli-mm'] and args.model_type in ['roberta']:
+            # HACK(label indices are swapped in RoBERTa pretrained model)
+            label_list[1], label_list[2] = label_list[2], label_list[1] 
         examples = processor.get_dev_examples(args.data_dir) if evaluate else processor.get_train_examples(args.data_dir)
         features = convert_examples_to_features(examples, label_list, args.max_seq_length, tokenizer, output_mode,
             cls_token_at_end=bool(args.model_type in ['xlnet']),            # xlnet has a cls token at the end
@@ -276,7 +279,9 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             sep_token=tokenizer.sep_token,
             sep_token_extra=bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
             pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0)
+            pad_token=1 if args.model_type in ['roberta'] else 0, # TODO(Lysandre: replace with tokenizer.pad_token when implemented)
+            pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
+        )
         if args.local_rank in [-1, 0]:
             logger.info("Saving features into cached file %s", cached_features_file)
             torch.save(features, cached_features_file)

From 39f426be6577d4534a058c9c42d52053a0ef9257 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Tue, 13 Aug 2019 15:19:50 -0400
Subject: [PATCH 20/45] Added special tokens <pad> and <mask> to RoBERTa.

---
 examples/run_glue.py                         | 2 +-
 pytorch_transformers/tokenization_roberta.py | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 445a9a5912..c0f70e0863 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -279,7 +279,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             sep_token=tokenizer.sep_token,
             sep_token_extra=bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
             pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-            pad_token=1 if args.model_type in ['roberta'] else 0, # TODO(Lysandre: replace with tokenizer.pad_token when implemented)
+            pad_token=tokenizer.encoder[tokenizer.pad_token] if args.model_type in ['roberta'] else tokenizer.vocab[tokenizer.pad_token],
             pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
         )
         if args.local_rank in [-1, 0]:
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
index 8f5cecee8a..1db8013183 100644
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -73,9 +73,10 @@ class RobertaTokenizer(PreTrainedTokenizer):
     max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
 
     def __init__(self, vocab_file, merges_file, errors='replace', bos_token="<s>", eos_token="</s>", sep_token="</s>",
-                 cls_token="<s>", unk_token="<unk>", **kwargs):
+                 cls_token="<s>", unk_token="<unk>", pad_token='<pad>', mask_token='<mask>', **kwargs):
         super(RobertaTokenizer, self).__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token,
-                                               sep_token=sep_token, cls_token=cls_token, **kwargs)
+                                               sep_token=sep_token, cls_token=cls_token, pad_token=pad_token,
+                                               mask_token=mask_token, **kwargs)
 
         self.encoder = json.load(open(vocab_file, encoding="utf-8"))
         self.decoder = {v: k for k, v in self.encoder.items()}

From c4ef1034474a1cad80674f1ce4c9fdaaa4d1f937 Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Wed, 14 Aug 2019 12:31:09 -0400
Subject: [PATCH 21/45] [RoBERTa] First 4 authors

cf. https://github.com/huggingface/pytorch-transformers/pull/964#discussion_r313574354

Co-Authored-By: Myle Ott <myleott@fb.com>
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index f77934bbcc..f223394868 100644
--- a/README.md
+++ b/README.md
@@ -12,7 +12,7 @@ The library currently contains PyTorch implementations, pre-trained model weight
 4. **[Transformer-XL](https://github.com/kimiyoung/transformer-xl)** (from Google/CMU) released with the paper [Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context](https://arxiv.org/abs/1901.02860) by Zihang Dai*, Zhilin Yang*, Yiming Yang, Jaime Carbonell, Quoc V. Le, Ruslan Salakhutdinov.
 5. **[XLNet](https://github.com/zihangdai/xlnet/)** (from Google/CMU) released with the paper [​XLNet: Generalized Autoregressive Pretraining for Language Understanding](https://arxiv.org/abs/1906.08237) by Zhilin Yang*, Zihang Dai*, Yiming Yang, Jaime Carbonell, Ruslan Salakhutdinov, Quoc V. Le.
 6. **[XLM](https://github.com/facebookresearch/XLM/)** (from Facebook) released together with the paper [Cross-lingual Language Model Pretraining](https://arxiv.org/abs/1901.07291) by Guillaume Lample and Alexis Conneau.
-7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott et al.
+7. **[RoBERTa](https://github.com/pytorch/fairseq/tree/master/examples/roberta)** (from Facebook), a [Robustly Optimized BERT Pretraining Approach](https://arxiv.org/abs/1907.11692) by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du et al.
 
 These implementations have been tested on several datasets (see the example scripts) and should match the performances of the original implementations (e.g. ~93 F1 on SQuAD for BERT Whole-Word-Masking, ~88 F1 on RocStories for OpenAI GPT, ~18.3 perplexity on WikiText 103 for Transformer-XL, ~0.916 Peason R coefficient on STS-B for XLNet). You can find more details on the performances in the Examples section of the [documentation](https://huggingface.co/pytorch-transformers/examples.html).
 

From 572dcfd1db0bc18fbce8c14cef82de41fdae2465 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Wed, 14 Aug 2019 14:56:14 -0400
Subject: [PATCH 22/45] Doc

---
 docs/source/index.rst                        |   1 +
 docs/source/model_doc/roberta.rst            |  36 ++++
 docs/source/pretrained_models.rst            | 196 ++++++++++---------
 pytorch_transformers/modeling_roberta.py     | 162 ++++++++++++++-
 pytorch_transformers/tokenization_roberta.py |   3 +-
 pytorch_transformers/tokenization_utils.py   |  54 +++--
 6 files changed, 327 insertions(+), 125 deletions(-)
 create mode 100644 docs/source/model_doc/roberta.rst

diff --git a/docs/source/index.rst b/docs/source/index.rst
index b613596331..37b3509fe4 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -47,3 +47,4 @@ The library currently contains PyTorch implementations, pre-trained model weight
     model_doc/gpt2
     model_doc/xlm
     model_doc/xlnet
+    model_doc/roberta
diff --git a/docs/source/model_doc/roberta.rst b/docs/source/model_doc/roberta.rst
new file mode 100644
index 0000000000..e2de917e35
--- /dev/null
+++ b/docs/source/model_doc/roberta.rst
@@ -0,0 +1,36 @@
+RoBERTa
+----------------------------------------------------
+
+``RobertaConfig``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.RobertaConfig
+    :members:
+
+
+``RobertaTokenizer``
+~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.RobertaTokenizer
+    :members:
+
+
+``RobertaModel``
+~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.RobertaModel
+    :members:
+
+
+``RobertaForMaskedLM``
+~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.RobertaForMaskedLM
+    :members:
+
+
+``RobertaForSequenceClassification``
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: pytorch_transformers.RobertaForSequenceClassification
+    :members:
diff --git a/docs/source/pretrained_models.rst b/docs/source/pretrained_models.rst
index b23a96ff7c..987882d12e 100644
--- a/docs/source/pretrained_models.rst
+++ b/docs/source/pretrained_models.rst
@@ -4,97 +4,109 @@ Pretrained models
 Here is the full list of the currently provided pretrained models together with a short presentation of each model.
 
 
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-| Architecture      | Shortcut name                                              | Details of the model                                                                                                      |
-+===================+============================================================+===========================================================================================================================+
-| BERT              | ``bert-base-uncased``                                      | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
-|                   |                                                            | Trained on lower-cased English text                                                                                       |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-uncased``                                     | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
-|                   |                                                            | Trained on lower-cased English text                                                                                       |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-cased``                                        | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
-|                   |                                                            | Trained on cased English text                                                                                             |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-cased``                                       | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
-|                   |                                                            | Trained on cased English text                                                                                             |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-multilingual-uncased``                         | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters                                               |
-|                   |                                                            | Trained on lower-cased text in the top 102 languages with the largest Wikipedias                                          |
-|                   |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__)                                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-multilingual-cased``                           | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters                                                    |
-|                   |                                                            | Trained on cased text in the top 104 languages with the largest Wikipedias                                                |
-|                   |                                                            | (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__)                                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-chinese``                                      | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
-|                   |                                                            | Trained on cased Chinese Simplified and Traditional text                                                                  |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-german-cased``                                 | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
-|                   |                                                            | Trained on cased German text by Deepset.ai                                                                                |
-|                   |                                                            | (see `details on deepset.ai website <https://deepset.ai/german-bert>`__)                                                  |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-uncased-whole-word-masking``                  | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
-|                   |                                                            | Trained on lower-cased English text using Whole-Word-Masking                                                              |
-|                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__)                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-cased-whole-word-masking``                    | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
-|                   |                                                            | Trained on cased English text using Whole-Word-Masking                                                                    |
-|                   |                                                            | (see `details <https://github.com/google-research/bert/#bert>`__)                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
-|                   |                                                            | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD (see details of fine-tuning in the                |
-|                   |                                                            | `example section <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`__)                           |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
-|                   |                                                            | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                     |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)       |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``bert-base-cased-finetuned-mrpc``                         | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
-|                   |                                                            | The ``bert-base-cased`` model fine-tuned on MRPC                                                                          |
-|                   |                                                            | (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__)       |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-| GPT               | ``openai-gpt``                                             | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
-|                   |                                                            | OpenAI GPT English model                                                                                                  |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-| GPT-2             | ``gpt2``                                                   | 12-layer, 768-hidden, 12-heads, 117M parameters                                                                           |
-|                   |                                                            | OpenAI GPT-2 English model                                                                                                |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``gpt2-medium``                                            | 24-layer, 1024-hidden, 16-heads, 345M parameters                                                                          |
-|                   |                                                            | OpenAI's Medium-sized GPT-2 English model                                                                                 |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-| Transformer-XL    | ``transfo-xl-wt103``                                       | 18-layer, 1024-hidden, 16-heads, 257M parameters                                                                          |
-|                   |                                                            | English model trained on wikitext-103                                                                                     |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-| XLNet             | ``xlnet-base-cased``                                       | 12-layer, 768-hidden, 12-heads, 110M parameters                                                                           |
-|                   |                                                            | XLNet English model                                                                                                       |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlnet-large-cased``                                      | 24-layer, 1024-hidden, 16-heads, 340M parameters                                                                          |
-|                   |                                                            | XLNet Large English model                                                                                                 |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-| XLM               | ``xlm-mlm-en-2048``                                        | 12-layer, 1024-hidden, 8-heads                                                                                            |
-|                   |                                                            | XLM English model                                                                                                         |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-ende-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
-|                   |                                                            | XLM English-German Multi-language model                                                                                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-enfr-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
-|                   |                                                            | XLM English-French Multi-language model                                                                                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-enro-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
-|                   |                                                            | XLM English-Romanian Multi-language model                                                                                 |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-xnli15-1024``                                    | 12-layer, 1024-hidden, 8-heads                                                                                            |
-|                   |                                                            | XLM Model pre-trained with MLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                   |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-mlm-tlm-xnli15-1024``                                | 12-layer, 1024-hidden, 8-heads                                                                                            |
-|                   |                                                            | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.             |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-clm-enfr-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
-|                   |                                                            | XLM English model trained with CLM (Causal Language Modeling)                                                             |
-|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
-|                   | ``xlm-clm-ende-1024``                                      | 12-layer, 1024-hidden, 8-heads                                                                                            |
-|                   |                                                            | XLM English-German Multi-language model trained with CLM (Causal Language Modeling)                                       |
-+-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------+
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Architecture      | Shortcut name                                              | Details of the model                                                                                                                  |
++===================+============================================================+=======================================================================================================================================+
+| BERT              | ``bert-base-uncased``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on lower-cased English text.                                                                                                |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-uncased``                                     | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                   |                                                            | | Trained on lower-cased English text.                                                                                                |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-cased``                                        | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on cased English text.                                                                                                      |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-cased``                                       | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                   |                                                            | | Trained on cased English text.                                                                                                      |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-multilingual-uncased``                         | | (Original, not recommended) 12-layer, 768-hidden, 12-heads, 110M parameters.                                                        |
+|                   |                                                            | | Trained on lower-cased text in the top 102 languages with the largest Wikipedias                                                    |
+|                   |                                                            |  (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__).                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-multilingual-cased``                           | | (New, **recommended**) 12-layer, 768-hidden, 12-heads, 110M parameters.                                                             |
+|                   |                                                            | | Trained on cased text in the top 104 languages with the largest Wikipedias                                                          |
+|                   |                                                            |  (see `details <https://github.com/google-research/bert/blob/master/multilingual.md>`__).                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-chinese``                                      | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on cased Chinese Simplified and Traditional text.                                                                           |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-german-cased``                                 | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | Trained on cased German text by Deepset.ai                                                                                          |
+|                   |                                                            |  (see `details on deepset.ai website <https://deepset.ai/german-bert>`__).                                                            |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-uncased-whole-word-masking``                  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                   |                                                            | | Trained on lower-cased English text using Whole-Word-Masking                                                                        |
+|                   |                                                            |  (see `details <https://github.com/google-research/bert/#bert>`__).                                                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-cased-whole-word-masking``                    | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                   |                                                            | | Trained on cased English text using Whole-Word-Masking                                                                              |
+|                   |                                                            |  (see `details <https://github.com/google-research/bert/#bert>`__).                                                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-uncased-whole-word-masking-finetuned-squad``  | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                   |                                                            | | The ``bert-large-uncased-whole-word-masking`` model fine-tuned on SQuAD                                                             |
+|                   |                                                            |  (see details of fine-tuning in the `example section <https://github.com/huggingface/pytorch-transformers/tree/master/examples>`__).  |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-large-cased-whole-word-masking-finetuned-squad``    | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                   |                                                            | | The ``bert-large-cased-whole-word-masking`` model fine-tuned on SQuAD                                                               |
+|                   |                                                            |  (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__).                 |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``bert-base-cased-finetuned-mrpc``                         | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | The ``bert-base-cased`` model fine-tuned on MRPC                                                                                    |
+|                   |                                                            |  (see `details of fine-tuning in the example section <https://huggingface.co/pytorch-transformers/examples.html>`__).                 |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| GPT               | ``openai-gpt``                                             | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | OpenAI GPT English model                                                                                                            |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| GPT-2             | ``gpt2``                                                   | | 12-layer, 768-hidden, 12-heads, 117M parameters.                                                                                    |
+|                   |                                                            | | OpenAI GPT-2 English model                                                                                                          |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``gpt2-medium``                                            | | 24-layer, 1024-hidden, 16-heads, 345M parameters.                                                                                   |
+|                   |                                                            | | OpenAI's Medium-sized GPT-2 English model                                                                                           |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| Transformer-XL    | ``transfo-xl-wt103``                                       | | 18-layer, 1024-hidden, 16-heads, 257M parameters.                                                                                   |
+|                   |                                                            | | English model trained on wikitext-103                                                                                               |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| XLNet             | ``xlnet-base-cased``                                       | | 12-layer, 768-hidden, 12-heads, 110M parameters.                                                                                    |
+|                   |                                                            | | XLNet English model                                                                                                                 |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlnet-large-cased``                                      | | 24-layer, 1024-hidden, 16-heads, 340M parameters.                                                                                   |
+|                   |                                                            | | XLNet Large English model                                                                                                           |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| XLM               | ``xlm-mlm-en-2048``                                        | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   |                                                            | | XLM English model                                                                                                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-ende-1024``                                      | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   |                                                            | | XLM English-German Multi-language model                                                                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-enfr-1024``                                      | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   |                                                            | | XLM English-French Multi-language model                                                                                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-enro-1024``                                      | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   |                                                            | | XLM English-Romanian Multi-language model                                                                                           |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-xnli15-1024``                                    | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   |                                                            | | XLM Model pre-trained with MLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                             |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-mlm-tlm-xnli15-1024``                                | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   |                                                            | | XLM Model pre-trained with MLM + TLM on the `15 XNLI languages <https://github.com/facebookresearch/XNLI>`__.                       |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-clm-enfr-1024``                                      | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   |                                                            | | XLM English-French Multi-language model trained with CLM (Causal Language Modeling)                                                 |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``xlm-clm-ende-1024``                                      | | 12-layer, 1024-hidden, 8-heads                                                                                                      |
+|                   |                                                            | | XLM English-German Multi-language model trained with CLM (Causal Language Modeling)                                                 |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+| RoBERTa           | ``roberta-base``                                           | | 12-layer, 768-hidden, 12-heads, 125M parameters                                                                                     |
+|                   |                                                            | | RoBERTa using the BERT-base architecture                                                                                            |
+|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``roberta-large``                                          | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
+|                   |                                                            | | RoBERTa using the BERT-large architecture                                                                                           |
+|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
+|                   +------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
+|                   | ``roberta-large-mnli``                                     | | 24-layer, 1024-hidden, 16-heads, 355M parameters                                                                                    |
+|                   |                                                            | | ``roberta-large`` fine-tuned on `MNLI <http://www.nyu.edu/projects/bowman/multinli/>`__.                                            |
+|                   |                                                            | (see `details <https://github.com/pytorch/fairseq/tree/master/examples/roberta>`__)                                                   |
++-------------------+------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------+
 
 .. <https://huggingface.co/pytorch-transformers/examples.html>`__
\ No newline at end of file
diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py
index 6cd4bc2d35..ebf701ead6 100644
--- a/pytorch_transformers/modeling_roberta.py
+++ b/pytorch_transformers/modeling_roberta.py
@@ -29,6 +29,8 @@ from pytorch_transformers.modeling_bert import (BertConfig, BertEmbeddings,
                                                 BertLayerNorm, BertModel,
                                                 BertPreTrainedModel, gelu)
 
+from pytorch_transformers.modeling_utils import add_start_docstrings
+
 logger = logging.getLogger(__name__)
 
 ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP = {
@@ -65,11 +67,93 @@ class RobertaEmbeddings(BertEmbeddings):
 class RobertaConfig(BertConfig):
     pretrained_config_archive_map = ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP
 
+
+ROBERTA_START_DOCSTRING = r"""    The RoBERTa model was proposed in
+    `RoBERTa: A Robustly Optimized BERT Pretraining Approach`_
+    by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer,
+    Veselin Stoyanov. It is based on Google's BERT model released in 2018.
+    
+    It builds on BERT and modifies key hyperparameters, removing the next-sentence pretraining
+    objective and training with much larger mini-batches and learning rates.
+    
+    This implementation is the same as BertModel with a tiny embeddings tweak as well as a setup for Roberta pretrained 
+    models.
+
+    This model is a PyTorch `torch.nn.Module`_ sub-class. Use it as a regular PyTorch Module and
+    refer to the PyTorch documentation for all matters related to general usage and behavior.
+
+    .. _`RoBERTa: A Robustly Optimized BERT Pretraining Approach`:
+        https://arxiv.org/abs/1907.11692
+
+    .. _`torch.nn.Module`:
+        https://pytorch.org/docs/stable/nn.html#module
+
+    Parameters:
+        config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the 
+            model.
+"""
+
+ROBERTA_INPUTS_DOCSTRING = r"""
+    Inputs:
+        **input_ids**: ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of input sequence tokens in the vocabulary.
+            To match pre-training, RoBERTa input sequence should be formatted with <s> and </s> tokens as follows:
+
+            (a) For sequence pairs:
+
+                ``tokens:         <s> Is this Jacksonville ? </s></s> No it is not . </s>``
+
+            (b) For single sequences:
+
+                ``tokens:         <s> the dog is hairy . </s>``
+
+            Fully encoded sequences or sequence pairs can be obtained using the RobertaTokenizer.encode function with 
+            the ``add_special_tokens`` parameter set to ``True``.
+            See :func:`pytorch_transformers.PreTrainedTokenizer.encode` and
+            :func:`pytorch_transformers.PreTrainedTokenizer.convert_tokens_to_ids` for details.
+        **position_ids**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Indices of the position of each input sequence token in the position embeddings.
+            Selected in the range ``[0, config.max_position_embeddings - 1]``.
+        **attention_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(batch_size, sequence_length)``:
+            Mask to avoid performing attention on padding token indices.
+            Mask values selected in ``[0, 1]``:
+            ``1`` for tokens that are NOT MASKED, ``0`` for MASKED tokens.
+        **head_mask**: (`optional`) ``torch.FloatTensor`` of shape ``(num_heads,)`` or ``(num_layers, num_heads)``:
+            Mask to nullify selected heads of the self-attention modules.
+            Mask values selected in ``[0, 1]``:
+            ``1`` indicates the head is **not masked**, ``0`` indicates the head is **masked**.
+"""
+
+@add_start_docstrings("The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+                      ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
 class RobertaModel(BertModel):
-    """
-    Same as BertModel with:
-    - a tiny embeddings tweak.
-    - setup for Roberta pretrained models
+    r"""
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **last_hidden_state**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, hidden_size)``
+            Sequence of hidden-states at the output of the last layer of the model.
+        **pooler_output**: ``torch.FloatTensor`` of shape ``(batch_size, hidden_size)``
+            Last layer hidden-state of the first token of the sequence (classification token)
+            further processed by a Linear layer and a Tanh activation function. The Linear
+            layer weights are trained from the next sentence prediction (classification)
+            objective during Bert pretraining. This output is usually *not* a good summary
+            of the semantic content of the input, you're often better with averaging or pooling
+            the sequence of hidden-states for the whole input sequence.
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = RobertaModel.from_pretrained('roberta-base')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+        last_hidden_states = outputs[0]  # The last hidden-state is the first element of the output tuple
+
     """
     config_class = RobertaConfig
     pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -82,9 +166,37 @@ class RobertaModel(BertModel):
         self.apply(self.init_weights)
 
 
+@add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
+    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
 class RobertaForMaskedLM(BertPreTrainedModel):
-    """
-    Roberta Model with a `language modeling` head on top.
+    r"""
+        **masked_lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-1, 0, ..., config.vocab_size - 1]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size - 1]``
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``masked_lm_labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Masked language modeling loss.
+        **prediction_scores**: ``torch.FloatTensor`` of shape ``(batch_size, sequence_length, config.vocab_size)``
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = RobertaForMaskedLM.from_pretrained('roberta-base')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, masked_lm_labels=input_ids)
+        loss, prediction_scores = outputs[:2]
+
     """
     config_class = RobertaConfig
     pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
@@ -112,14 +224,14 @@ class RobertaForMaskedLM(BertPreTrainedModel):
         sequence_output = outputs[0]
         prediction_scores = self.lm_head(sequence_output)
 
-        outputs = (prediction_scores,) + outputs[2:]
+        outputs = (prediction_scores,) + outputs[2:]  # Add hidden states and attention if they are here
 
         if masked_lm_labels is not None:
             loss_fct = CrossEntropyLoss(ignore_index=-1)
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             outputs = (masked_lm_loss,) + outputs
 
-        return outputs
+        return outputs  # (masked_lm_loss), prediction_scores, (hidden_states), (attentions)
 
 
 class RobertaLMHead(nn.Module):
@@ -144,9 +256,39 @@ class RobertaLMHead(nn.Module):
         return x
 
 
+@add_start_docstrings("""RoBERTa Model transformer with a sequence classification/regression head on top (a linear layer 
+    on top of the pooled output) e.g. for GLUE tasks. """,
+    ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)
 class RobertaForSequenceClassification(BertPreTrainedModel):
-    """
-    Roberta Model with a classifier head on top.
+    r"""
+        **labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size,)``:
+            Labels for computing the sequence classification/regression loss.
+            Indices should be in ``[0, ..., config.num_labels - 1]``.
+            If ``config.num_labels == 1`` a regression loss is computed (Mean-Square loss),
+            If ``config.num_labels > 1`` a classification loss is computed (Cross-Entropy).
+
+    Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
+        **loss**: (`optional`, returned when ``labels`` is provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Classification (or regression if config.num_labels==1) loss.
+        **logits**: ``torch.FloatTensor`` of shape ``(batch_size, config.num_labels)``
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        **hidden_states**: (`optional`, returned when ``config.output_hidden_states=True``)
+            list of ``torch.FloatTensor`` (one for the output of each layer + the output of the embeddings)
+            of shape ``(batch_size, sequence_length, hidden_size)``:
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (`optional`, returned when ``config.output_attentions=True``)
+            list of ``torch.FloatTensor`` (one for each layer) of shape ``(batch_size, num_heads, sequence_length, sequence_length)``:
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
+        model = RobertaForSequenceClassification.from_pretrained('roberta-base')
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute")).unsqueeze(0)  # Batch size 1
+        labels = torch.tensor([1]).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids, labels=labels)
+        loss, logits = outputs[:2]
+
     """
     config_class = RobertaConfig
     pretrained_model_archive_map = ROBERTA_PRETRAINED_MODEL_ARCHIVE_MAP
diff --git a/pytorch_transformers/tokenization_roberta.py b/pytorch_transformers/tokenization_roberta.py
index 1db8013183..edf4717c89 100644
--- a/pytorch_transformers/tokenization_roberta.py
+++ b/pytorch_transformers/tokenization_roberta.py
@@ -65,8 +65,7 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 
 class RobertaTokenizer(PreTrainedTokenizer):
     """
-    RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities:
-        - Byte-level BPE
+    RoBERTa BPE tokenizer, derived from the GPT-2 tokenizer. Peculiarities: Byte-level BPE
     """
     vocab_files_names = VOCAB_FILES_NAMES
     pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
diff --git a/pytorch_transformers/tokenization_utils.py b/pytorch_transformers/tokenization_utils.py
index 7bb9fd9d29..74d50b385d 100644
--- a/pytorch_transformers/tokenization_utils.py
+++ b/pytorch_transformers/tokenization_utils.py
@@ -180,9 +180,10 @@ class PreTrainedTokenizer(object):
 
     @classmethod
     def from_pretrained(cls, *inputs, **kwargs):
-        r""" Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
+        r"""
+        Instantiate a :class:`~pytorch_transformers.PreTrainedTokenizer` (or a derived class) from a predefined tokenizer.
 
-        Parameters:
+        Args:
             pretrained_model_name_or_path: either:
 
                 - a string with the `shortcut name` of a predefined tokenizer to load from cache or download, e.g.: ``bert-base-uncased``.
@@ -383,14 +384,15 @@ class PreTrainedTokenizer(object):
 
 
     def add_tokens(self, new_tokens):
-        """ Add a list of new tokens to the tokenizer class. If the new tokens are not in the
+        """
+        Add a list of new tokens to the tokenizer class. If the new tokens are not in the
         vocabulary, they are added to it with indices starting from length of the current vocabulary.
 
-            Parameters:
-                new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
+        Args:
+            new_tokens: list of string. Each string is a token to add. Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).
 
-            Returns:
-                Number of tokens added to the vocabulary.
+        Returns:
+            Number of tokens added to the vocabulary.
 
         Examples::
 
@@ -422,17 +424,20 @@ class PreTrainedTokenizer(object):
 
 
     def add_special_tokens(self, special_tokens_dict):
-        """ Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
-            to class attributes. If special tokens are NOT in the vocabulary, they are added
-            to it (indexed starting from the last index of the current vocabulary).
+        """
+        Add a dictionary of special tokens (eos, pad, cls...) to the encoder and link them
+        to class attributes. If special tokens are NOT in the vocabulary, they are added
+        to it (indexed starting from the last index of the current vocabulary).
 
-            Parameters:
-                special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes: [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``, ``additional_special_tokens``].
-                
-                    Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assign the index of the ``unk_token`` to them).
+        Args:
+            special_tokens_dict: dict of string. Keys should be in the list of predefined special attributes:
+                [``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``, ``mask_token``,
+                ``additional_special_tokens``].
 
-            Returns:
-                Number of tokens added to the vocabulary.
+                Tokens are only added if they are not already in the vocabulary (tested by checking if the tokenizer assigns the index of the ``unk_token`` to them).
+
+        Returns:
+            Number of tokens added to the vocabulary.
 
         Examples::
 
@@ -520,9 +525,16 @@ class PreTrainedTokenizer(object):
         raise NotImplementedError
 
     def encode(self, text, text_pair=None, add_special_tokens=False):
-        """ Converts a string in a sequence of ids (integer), using the tokenizer and vocabulary.
+        """
+        Converts a string into a sequence of ids (integer), using the tokenizer and vocabulary.
         
-        Same doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
+        Same as doing ``self.convert_tokens_to_ids(self.tokenize(text))``.
+
+        Args:
+            text: The first sequence to be encoded.
+            text_pair: Optional second sequence to be encoded.
+            add_special_tokens: if set to ``True``, the sequences will be encoded with the special tokens relative
+                to their model.
         """
         if text_pair is None:
             if add_special_tokens:
@@ -577,9 +589,9 @@ class PreTrainedTokenizer(object):
         return ' '.join(self.convert_ids_to_tokens(tokens))
 
     def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
-        """ Converts a sequence of ids (integer) in a string, using the tokenizer and vocabulary
-            with options to remove special tokens and clean up tokenization spaces.
-
+        """
+        Converts a sequence of ids (integer) into a string, using the tokenizer and vocabulary
+        with options to remove special tokens and clean up tokenization spaces.
         Similar to doing ``self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))``.
         """
         filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)

From 8308170156bdf41134fd0a8027f63f57f9e6a8d6 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 15 Aug 2019 10:29:04 -0400
Subject: [PATCH 23/45] Warning for RoBERTa sequences encoded without special
 tokens.

---
 pytorch_transformers/modeling_roberta.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py
index ebf701ead6..adb04b4b3a 100644
--- a/pytorch_transformers/modeling_roberta.py
+++ b/pytorch_transformers/modeling_roberta.py
@@ -165,6 +165,13 @@ class RobertaModel(BertModel):
         self.embeddings = RobertaEmbeddings(config)
         self.apply(self.init_weights)
 
+    def forward(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None, head_mask=None):
+        if input_ids[:, 0].sum().item() != 0:
+            logger.warning("A sequence with no special tokens has been passed to the RoBERTa model. "
+                           "This model requires special tokens in order to work. "
+                           "Please specify add_special_tokens=True in your encoding.")
+        return super(RobertaModel, self).forward(input_ids, token_type_ids, attention_mask, position_ids, head_mask)
+
 
 @add_start_docstrings("""RoBERTa Model with a `language modeling` head on top. """,
     ROBERTA_START_DOCSTRING, ROBERTA_INPUTS_DOCSTRING)

From fe02e45e488a4f067605cf9768171358de9726d3 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 15 Aug 2019 11:15:08 -0400
Subject: [PATCH 24/45] Release: 1.1.0

---
 pytorch_transformers/__init__.py | 2 +-
 setup.py                         | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/pytorch_transformers/__init__.py b/pytorch_transformers/__init__.py
index 38423de14b..62e3b8c47b 100644
--- a/pytorch_transformers/__init__.py
+++ b/pytorch_transformers/__init__.py
@@ -1,4 +1,4 @@
-__version__ = "1.0.0"
+__version__ = "1.1.0"
 from .tokenization_auto import AutoTokenizer
 from .tokenization_bert import BertTokenizer, BasicTokenizer, WordpieceTokenizer
 from .tokenization_openai import OpenAIGPTTokenizer
diff --git a/setup.py b/setup.py
index 4c23714980..c9f80fc224 100644
--- a/setup.py
+++ b/setup.py
@@ -38,10 +38,10 @@ from setuptools import find_packages, setup
 
 setup(
     name="pytorch_transformers",
-    version="1.0.0",
-    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Tim Rault, Google AI Language Team Authors, Open AI team Authors",
+    version="1.1.0",
+    author="Thomas Wolf, Lysandre Debut, Victor Sanh, Julien Chaumond, Google AI Language Team Authors, Open AI team Authors",
     author_email="thomas@huggingface.co",
-    description="Repository of pre-trained NLP Transformer models: BERT, GPT & GPT-2, Transformer-XL, XLNet and XLM",
+    description="Repository of pre-trained NLP Transformer models: BERT & RoBERTa, GPT & GPT-2, Transformer-XL, XLNet and XLM",
     long_description=open("README.md", "r", encoding='utf-8').read(),
     long_description_content_type="text/markdown",
     keywords='NLP deep learning transformer pytorch BERT GPT GPT-2 google openai CMU',

From e24e19ce3bbbc3fe317e4d277b919cd1cb31fc47 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 15 Aug 2019 14:02:11 -0400
Subject: [PATCH 25/45] Added RoBERTa to AutoModel/AutoConfig

---
 pytorch_transformers/modeling_auto.py | 33 +++++++++++++++++----------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py
index 64b151e3a3..47c37a57d6 100644
--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -29,6 +29,7 @@ from .modeling_gpt2 import GPT2Config, GPT2Model
 from .modeling_transfo_xl import TransfoXLConfig, TransfoXLModel
 from .modeling_xlnet import XLNetConfig, XLNetModel
 from .modeling_xlm import XLMConfig, XLMModel
+from .modeling_roberta import RobertaConfig, RobertaModel
 
 from .modeling_utils import PreTrainedModel, SequenceSummary
 
@@ -51,6 +52,7 @@ class AutoConfig(object):
             - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
             - contains `xlnet`: XLNetConfig (XLNet model)
             - contains `xlm`: XLMConfig (XLM model)
+            - contains `roberta`: RobertaConfig (RoBERTa model)
 
         This class cannot be instantiated using `__init__()` (throw an error).
     """
@@ -71,6 +73,7 @@ class AutoConfig(object):
             - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
             - contains `xlnet`: XLNetConfig (XLNet model)
             - contains `xlm`: XLMConfig (XLM model)
+            - contains `roberta`: RobertaConfig (RoBERTa model)
 
         Params:
             **pretrained_model_name_or_path**: either:
@@ -119,6 +122,8 @@ class AutoConfig(object):
             return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
@@ -137,12 +142,13 @@ class AutoModel(object):
 
         The base model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `bert`: BertConfig (Bert model)
-            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
-            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
-            - contains `xlnet`: XLNetConfig (XLNet model)
-            - contains `xlm`: XLMConfig (XLM model)
+            - contains `bert`: BertModel (Bert model)
+            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
+            - contains `xlnet`: XLNetModel (XLNet model)
+            - contains `xlm`: XLMModel (XLM model)
+            - contains `roberta`: RobertaModel (RoBERTa model)
 
         This class cannot be instantiated using `__init__()` (throw an error).
     """
@@ -157,12 +163,13 @@ class AutoModel(object):
 
         The base model class to instantiate is selected as the first pattern matching
         in the `pretrained_model_name_or_path` string (in the following order):
-            - contains `bert`: BertConfig (Bert model)
-            - contains `openai-gpt`: OpenAIGPTConfig (OpenAI GPT model)
-            - contains `gpt2`: GPT2Config (OpenAI GPT-2 model)
-            - contains `transfo-xl`: TransfoXLConfig (Transformer-XL model)
-            - contains `xlnet`: XLNetConfig (XLNet model)
-            - contains `xlm`: XLMConfig (XLM model)
+            - contains `bert`: BertModel (Bert model)
+            - contains `openai-gpt`: OpenAIGPTModel (OpenAI GPT model)
+            - contains `gpt2`: GPT2Model (OpenAI GPT-2 model)
+            - contains `transfo-xl`: TransfoXLModel (Transformer-XL model)
+            - contains `xlnet`: XLNetModel (XLNet model)
+            - contains `xlm`: XLMModel (XLM model)
+            - contains `roberta`: RobertaModel (RoBERTa model)
 
             The model is set in evaluation mode by default using `model.eval()` (Dropout modules are deactivated)
             To train the model, you should first set it back in training mode with `model.train()`
@@ -230,6 +237,8 @@ class AutoModel(object):
             return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "

From 83dba0b67bd8d142e830eab7aa6538b4dc50e1ef Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 15 Aug 2019 17:07:07 -0400
Subject: [PATCH 26/45] Added RoBERTa tokenizer to AutoTokenizer

---
 pytorch_transformers/modeling_auto.py     | 4 ++--
 pytorch_transformers/tokenization_auto.py | 7 ++++++-
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py
index 47c37a57d6..7c96b7a287 100644
--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -127,7 +127,7 @@ class AutoConfig(object):
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
 
 
 class AutoModel(object):
@@ -242,4 +242,4 @@ class AutoModel(object):
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))
diff --git a/pytorch_transformers/tokenization_auto.py b/pytorch_transformers/tokenization_auto.py
index acbe1cebc6..adb8f87cd7 100644
--- a/pytorch_transformers/tokenization_auto.py
+++ b/pytorch_transformers/tokenization_auto.py
@@ -24,6 +24,7 @@ from .tokenization_gpt2 import GPT2Tokenizer
 from .tokenization_transfo_xl import TransfoXLTokenizer
 from .tokenization_xlnet import XLNetTokenizer
 from .tokenization_xlm import XLMTokenizer
+from .tokenization_roberta import RobertaTokenizer
 
 logger = logging.getLogger(__name__)
 
@@ -44,6 +45,7 @@ class AutoTokenizer(object):
             - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
             - contains `xlnet`: XLNetTokenizer (XLNet model)
             - contains `xlm`: XLMTokenizer (XLM model)
+            - contains `roberta`: RobertaTokenizer (RoBERTa model)
 
         This class cannot be instantiated using `__init__()` (throw an error).
     """
@@ -64,6 +66,7 @@ class AutoTokenizer(object):
             - contains `transfo-xl`: TransfoXLTokenizer (Transformer-XL model)
             - contains `xlnet`: XLNetTokenizer (XLNet model)
             - contains `xlm`: XLMTokenizer (XLM model)
+            - contains `roberta`: RobertaTokenizer (RoBERTa model)
 
         Params:
             **pretrained_model_name_or_path**: either:
@@ -94,7 +97,9 @@ class AutoTokenizer(object):
             return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'roberta' in pretrained_model_name_or_path:
+            return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
-                         "'xlm'".format(pretrained_model_name_or_path))
+                         "'xlm', 'roberta'".format(pretrained_model_name_or_path))

From 9d0029e215f5ad0836d6be87458aab5142783af4 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Thu, 15 Aug 2019 17:17:35 -0400
Subject: [PATCH 27/45] Added RoBERTa example to README

---
 README.md | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 389c2f25ad..3389e10593 100644
--- a/README.md
+++ b/README.md
@@ -83,7 +83,8 @@ MODELS = [(BertModel,       BertTokenizer,      'bert-base-uncased'),
           (GPT2Model,       GPT2Tokenizer,      'gpt2'),
           (TransfoXLModel,  TransfoXLTokenizer, 'transfo-xl-wt103'),
           (XLNetModel,      XLNetTokenizer,     'xlnet-base-cased'),
-          (XLMModel,        XLMTokenizer,       'xlm-mlm-enfr-1024')]
+          (XLMModel,        XLMTokenizer,       'xlm-mlm-enfr-1024'),
+          (RobertaModel,    RobertaTokenizer,   'roberta-base')]
 
 # Let's encode some text in a sequence of hidden-states using each model:
 for model_class, tokenizer_class, pretrained_weights in MODELS:

From b8ff56896ccbd27a54035a90a3bc278a44541a74 Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Fri, 16 Aug 2019 12:11:05 +0800
Subject: [PATCH 28/45] Fix bug of multi-gpu training in lm finetuning

---
 examples/lm_finetuning/finetune_on_pregenerated.py | 2 +-
 examples/lm_finetuning/simple_lm_finetuning.py     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 9fcc5f2cb1..7c40342f18 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -320,7 +320,7 @@ def main():
                     global_step += 1
 
     # Save a trained model
-    if  n_gpu > 1 and torch.distributed.get_rank() == 0  or n_gpu <=1 :
+    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
         logging.info("** ** * Saving fine-tuned model ** ** * ")
         model.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index ba5f832827..25333de0ed 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -507,7 +507,7 @@ def main():
 
     if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
         raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
-    if not os.path.exists(args.output_dir) and ( n_gpu > 1 and torch.distributed.get_rank() == 0  or n_gpu <=1 ):
+    if not os.path.exists(args.output_dir) and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         os.makedirs(args.output_dir)
 
     tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)
@@ -608,7 +608,7 @@ def main():
                     global_step += 1
 
         # Save a trained model
-        if args.do_train and ( n_gpu > 1 and torch.distributed.get_rank() == 0  or n_gpu <=1):
+        if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
             logger.info("** ** * Saving fine - tuned model ** ** * ")
             model.save_pretrained(args.output_dir)
             tokenizer.save_pretrained(args.output_dir)

From ab05280666c9e1cfbbb23122825f3a41b7ff82c3 Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 16 Aug 2019 09:53:26 -0400
Subject: [PATCH 29/45] Order of strings in AutoModel/AutoTokenizer updated.

---
 pytorch_transformers/modeling_auto.py     | 12 ++++++------
 pytorch_transformers/tokenization_auto.py |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/pytorch_transformers/modeling_auto.py b/pytorch_transformers/modeling_auto.py
index 7c96b7a287..516107c40b 100644
--- a/pytorch_transformers/modeling_auto.py
+++ b/pytorch_transformers/modeling_auto.py
@@ -110,7 +110,9 @@ class AutoConfig(object):
             assert unused_kwargs == {'foo': False}
 
         """
-        if 'bert' in pretrained_model_name_or_path:
+        if 'roberta' in pretrained_model_name_or_path:
+            return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
             return BertConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'openai-gpt' in pretrained_model_name_or_path:
             return OpenAIGPTConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
@@ -122,8 +124,6 @@ class AutoConfig(object):
             return XLNetConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return XLMConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
-            return RobertaConfig.from_pretrained(pretrained_model_name_or_path, **kwargs)
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
@@ -225,7 +225,9 @@ class AutoModel(object):
             model = AutoModel.from_pretrained('./tf_model/bert_tf_checkpoint.ckpt.index', from_tf=True, config=config)
 
         """
-        if 'bert' in pretrained_model_name_or_path:
+        if 'roberta' in pretrained_model_name_or_path:
+            return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
             return BertModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'openai-gpt' in pretrained_model_name_or_path:
             return OpenAIGPTModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
@@ -237,8 +239,6 @@ class AutoModel(object):
             return XLNetModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return XLMModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
-            return RobertaModel.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs)
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "
diff --git a/pytorch_transformers/tokenization_auto.py b/pytorch_transformers/tokenization_auto.py
index adb8f87cd7..b4b6336952 100644
--- a/pytorch_transformers/tokenization_auto.py
+++ b/pytorch_transformers/tokenization_auto.py
@@ -85,7 +85,9 @@ class AutoTokenizer(object):
             config = AutoTokenizer.from_pretrained('./test/bert_saved_model/')  # E.g. tokenizer was saved using `save_pretrained('./test/saved_model/')`
 
         """
-        if 'bert' in pretrained_model_name_or_path:
+        if 'roberta' in pretrained_model_name_or_path:
+            return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        elif 'bert' in pretrained_model_name_or_path:
             return BertTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'openai-gpt' in pretrained_model_name_or_path:
             return OpenAIGPTTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
@@ -97,8 +99,6 @@ class AutoTokenizer(object):
             return XLNetTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
         elif 'xlm' in pretrained_model_name_or_path:
             return XLMTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
-        elif 'roberta' in pretrained_model_name_or_path:
-            return RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
 
         raise ValueError("Unrecognized model identifier in {}. Should contains one of "
                          "'bert', 'openai-gpt', 'gpt2', 'transfo-xl', 'xlnet', "

From 7e7fc53da5f230db379ece739457c81b2f50f13e Mon Sep 17 00:00:00 2001
From: LysandreJik <lysandre.debut@reseau.eseo.fr>
Date: Fri, 16 Aug 2019 11:02:10 -0400
Subject: [PATCH 30/45] Fixing run_glue example with RoBERTa

---
 examples/run_glue.py   | 2 +-
 examples/utils_glue.py | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index c0f70e0863..7fb0732e61 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -279,7 +279,7 @@ def load_and_cache_examples(args, task, tokenizer, evaluate=False):
             sep_token=tokenizer.sep_token,
             sep_token_extra=bool(args.model_type in ['roberta']),           # roberta uses an extra separator b/w pairs of sentences, cf. github.com/pytorch/fairseq/commit/1684e166e3da03f5b600dbb7855cb98ddfcd0805
             pad_on_left=bool(args.model_type in ['xlnet']),                 # pad on the left for xlnet
-            pad_token=tokenizer.encoder[tokenizer.pad_token] if args.model_type in ['roberta'] else tokenizer.vocab[tokenizer.pad_token],
+            pad_token=tokenizer.convert_tokens_to_ids([tokenizer.pad_token])[0],
             pad_token_segment_id=4 if args.model_type in ['xlnet'] else 0,
         )
         if args.local_rank in [-1, 0]:
diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index c955e4d0ce..e1649fa5af 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -425,9 +425,10 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
             # Account for [CLS], [SEP], [SEP] with "- 3"
             _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
         else:
-            # Account for [CLS] and [SEP] with "- 2"
-            if len(tokens_a) > max_seq_length - 2:
-                tokens_a = tokens_a[:(max_seq_length - 2)]
+            # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
+            special_tokens_count = 3 if sep_token_extra else 2
+            if len(tokens_a) > max_seq_length - special_tokens_count:
+                tokens_a = tokens_a[:(max_seq_length - special_tokens_count)]
 
         # The convention in BERT is:
         # (a) For sequence pairs:

From d8923270e6c497862f990a3c72e40cc1ddd01d4e Mon Sep 17 00:00:00 2001
From: Jason Phang <email@jasonphang.com>
Date: Fri, 16 Aug 2019 15:58:19 -0400
Subject: [PATCH 31/45] Correct truncation for RoBERTa in 2-input GLUE

---
 examples/utils_glue.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/utils_glue.py b/examples/utils_glue.py
index e1649fa5af..3e3f104672 100644
--- a/examples/utils_glue.py
+++ b/examples/utils_glue.py
@@ -422,8 +422,9 @@ def convert_examples_to_features(examples, label_list, max_seq_length,
             tokens_b = tokenizer.tokenize(example.text_b)
             # Modifies `tokens_a` and `tokens_b` in place so that the total
             # length is less than the specified length.
-            # Account for [CLS], [SEP], [SEP] with "- 3"
-            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
+            # Account for [CLS], [SEP], [SEP] with "- 3" and with "- 4" for RoBERTa.
+            special_tokens_count = 4 if sep_token_extra else 3
+            _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - special_tokens_count)
         else:
             # Account for [CLS] and [SEP] with "- 2" and with "- 3" for RoBERTa.
             special_tokens_count = 3 if sep_token_extra else 2

From 189ff9b66408a1758f3732725db3871322f3e0e6 Mon Sep 17 00:00:00 2001
From: Christophe Bourguignat <christophe.bourguignat@zelros.com>
Date: Sat, 17 Aug 2019 18:46:50 +0200
Subject: [PATCH 32/45] Update README after RoBERTa addition

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 3389e10593..7d2445fc11 100644
--- a/README.md
+++ b/README.md
@@ -76,7 +76,7 @@ import torch
 from pytorch_transformers import *
 
 # PyTorch-Transformers has a unified API
-# for 6 transformer architectures and 27 pretrained weights.
+# for 7 transformer architectures and 30 pretrained weights.
 #          Model          | Tokenizer          | Pretrained weights shortcut
 MODELS = [(BertModel,       BertTokenizer,      'bert-base-uncased'),
           (OpenAIGPTModel,  OpenAIGPTTokenizer, 'openai-gpt'),

From 00e9c4cc9616cab1666cab0a331b5d7e68946928 Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Sun, 18 Aug 2019 11:02:02 +0800
Subject: [PATCH 33/45] Fix: save model/model.module

---
 examples/lm_finetuning/finetune_on_pregenerated.py | 11 ++++++-----
 examples/lm_finetuning/simple_lm_finetuning.py     |  3 ++-
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 7c40342f18..1177d84cd4 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -155,12 +155,12 @@ def main():
                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                         "0 (default value): dynamic loss scaling.\n"
                         "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument("--warmup_steps", 
-                        default=0, 
+    parser.add_argument("--warmup_steps",
+                        default=0,
                         type=int,
                         help="Linear warmup over warmup_steps.")
-    parser.add_argument("--adam_epsilon", 
-                        default=1e-8, 
+    parser.add_argument("--adam_epsilon",
+                        default=1e-8,
                         type=float,
                         help="Epsilon for Adam optimizer.")
     parser.add_argument("--learning_rate",
@@ -322,7 +322,8 @@ def main():
     # Save a trained model
     if args.local_rank == -1 or torch.distributed.get_rank() == 0:
         logging.info("** ** * Saving fine-tuned model ** ** * ")
-        model.save_pretrained(args.output_dir)
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
 
diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index 25333de0ed..9633640faf 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -610,7 +610,8 @@ def main():
         # Save a trained model
         if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
             logger.info("** ** * Saving fine - tuned model ** ** * ")
-            model.save_pretrained(args.output_dir)
+            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+            model_to_save.save_pretrained(args.output_dir)
             tokenizer.save_pretrained(args.output_dir)
 
 

From 1ef41b83374ce5756e24746201d21432d7ecada0 Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Sun, 18 Aug 2019 11:03:12 +0800
Subject: [PATCH 34/45] Revert "Fix: save model/model.module"

This reverts commit 00e9c4cc9616cab1666cab0a331b5d7e68946928.
---
 examples/lm_finetuning/finetune_on_pregenerated.py | 11 +++++------
 examples/lm_finetuning/simple_lm_finetuning.py     |  3 +--
 2 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 1177d84cd4..7c40342f18 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -155,12 +155,12 @@ def main():
                         help="Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                         "0 (default value): dynamic loss scaling.\n"
                         "Positive power of 2: static loss scaling value.\n")
-    parser.add_argument("--warmup_steps",
-                        default=0,
+    parser.add_argument("--warmup_steps", 
+                        default=0, 
                         type=int,
                         help="Linear warmup over warmup_steps.")
-    parser.add_argument("--adam_epsilon",
-                        default=1e-8,
+    parser.add_argument("--adam_epsilon", 
+                        default=1e-8, 
                         type=float,
                         help="Epsilon for Adam optimizer.")
     parser.add_argument("--learning_rate",
@@ -322,8 +322,7 @@ def main():
     # Save a trained model
     if args.local_rank == -1 or torch.distributed.get_rank() == 0:
         logging.info("** ** * Saving fine-tuned model ** ** * ")
-        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
-        model_to_save.save_pretrained(args.output_dir)
+        model.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
 
diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index 9633640faf..25333de0ed 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -610,8 +610,7 @@ def main():
         # Save a trained model
         if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
             logger.info("** ** * Saving fine - tuned model ** ** * ")
-            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
-            model_to_save.save_pretrained(args.output_dir)
+            model.save_pretrained(args.output_dir)
             tokenizer.save_pretrained(args.output_dir)
 
 

From 856a63da4d1f0f302633dc73e2d4a1f698bbafda Mon Sep 17 00:00:00 2001
From: wangfei <1140554608@qq.com>
Date: Sun, 18 Aug 2019 11:03:47 +0800
Subject: [PATCH 35/45] Fix: save model/model.module

---
 examples/lm_finetuning/finetune_on_pregenerated.py | 3 ++-
 examples/lm_finetuning/simple_lm_finetuning.py     | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 7c40342f18..eefa56c824 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -322,7 +322,8 @@ def main():
     # Save a trained model
     if args.local_rank == -1 or torch.distributed.get_rank() == 0:
         logging.info("** ** * Saving fine-tuned model ** ** * ")
-        model.save_pretrained(args.output_dir)
+        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+        model_to_save.save_pretrained(args.output_dir)
         tokenizer.save_pretrained(args.output_dir)
 
 
diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index 25333de0ed..9633640faf 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -610,7 +610,8 @@ def main():
         # Save a trained model
         if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
             logger.info("** ** * Saving fine - tuned model ** ** * ")
-            model.save_pretrained(args.output_dir)
+            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
+            model_to_save.save_pretrained(args.output_dir)
             tokenizer.save_pretrained(args.output_dir)
 
 

From 40acf6b52a5250608c2b90edd955835131971d5a Mon Sep 17 00:00:00 2001
From: Chi-Liang Liu <liangtaiwan1230@gmail.com>
Date: Tue, 30 Jul 2019 18:37:37 +0800
Subject: [PATCH 36/45] don't save model without training

---
 examples/run_squad.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index f0ae9169ad..f2d29fd6b1 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -481,7 +481,7 @@ def main():
 
 
     # Save the trained model and the tokenizer
-    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):  # grouped so the rank is only queried after training
         # Create output directory if needed
         if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
             os.makedirs(args.output_dir)

From c589862b783b94a8408b40c6dc9bf4a14b2ee391 Mon Sep 17 00:00:00 2001
From: Lysandre <lysandre.debut@reseau.eseo.fr>
Date: Mon, 19 Aug 2019 10:17:47 -0400
Subject: [PATCH 37/45] Doc: loading from config alone does not load the model
 weights

---
 pytorch_transformers/modeling_bert.py       | 4 +++-
 pytorch_transformers/modeling_gpt2.py       | 2 ++
 pytorch_transformers/modeling_openai.py     | 2 ++
 pytorch_transformers/modeling_roberta.py    | 3 ++-
 pytorch_transformers/modeling_transfo_xl.py | 2 ++
 pytorch_transformers/modeling_utils.py      | 4 ++++
 pytorch_transformers/modeling_xlm.py        | 2 ++
 pytorch_transformers/modeling_xlnet.py      | 2 ++
 8 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/pytorch_transformers/modeling_bert.py b/pytorch_transformers/modeling_bert.py
index 51d8788545..9c20eac9bf 100644
--- a/pytorch_transformers/modeling_bert.py
+++ b/pytorch_transformers/modeling_bert.py
@@ -577,7 +577,9 @@ BERT_START_DOCSTRING = r"""    The BERT model was proposed in
         https://pytorch.org/docs/stable/nn.html#module
 
     Parameters:
-        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model.
+        config (:class:`~pytorch_transformers.BertConfig`): Model configuration class with all the parameters of the model. 
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 BERT_INPUTS_DOCSTRING = r"""
diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index 5211def3e3..f67d0e88d5 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -383,6 +383,8 @@ GPT2_START_DOCSTRING = r"""    OpenAI GPT-2 model was proposed in
 
     Parameters:
         config (:class:`~pytorch_transformers.GPT2Config`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 GPT2_INPUTS_DOCSTRING = r"""    Inputs:
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index 364923b0af..e8648487be 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -397,6 +397,8 @@ OPENAI_GPT_START_DOCSTRING = r"""    OpenAI GPT model was proposed in
 
     Parameters:
         config (:class:`~pytorch_transformers.OpenAIGPTConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 OPENAI_GPT_INPUTS_DOCSTRING = r"""    Inputs:
diff --git a/pytorch_transformers/modeling_roberta.py b/pytorch_transformers/modeling_roberta.py
index adb04b4b3a..e3065cf60b 100644
--- a/pytorch_transformers/modeling_roberta.py
+++ b/pytorch_transformers/modeling_roberta.py
@@ -90,7 +90,8 @@ ROBERTA_START_DOCSTRING = r"""    The RoBERTa model was proposed in
 
     Parameters:
         config (:class:`~pytorch_transformers.RobertaConfig`): Model configuration class with all the parameters of the 
-            model.
+            model. Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 ROBERTA_INPUTS_DOCSTRING = r"""
diff --git a/pytorch_transformers/modeling_transfo_xl.py b/pytorch_transformers/modeling_transfo_xl.py
index cb5416964c..553a71fffe 100644
--- a/pytorch_transformers/modeling_transfo_xl.py
+++ b/pytorch_transformers/modeling_transfo_xl.py
@@ -928,6 +928,8 @@ TRANSFO_XL_START_DOCSTRING = r"""    The Transformer-XL model was proposed in
 
     Parameters:
         config (:class:`~pytorch_transformers.TransfoXLConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 TRANSFO_XL_INPUTS_DOCSTRING = r"""
diff --git a/pytorch_transformers/modeling_utils.py b/pytorch_transformers/modeling_utils.py
index 35f82e324f..edc6b3903e 100644
--- a/pytorch_transformers/modeling_utils.py
+++ b/pytorch_transformers/modeling_utils.py
@@ -71,6 +71,10 @@ class PretrainedConfig(object):
     r""" Base class for all configuration classes.
         Handles a few parameters common to all models' configurations as well as methods for loading/downloading/saving configurations.
 
+        Note:
+            A configuration file can be loaded and saved to disk. Loading the configuration file and using this file to initialize a model does **not** load the model weights.
+            It only affects the model's configuration.
+
         Class attributes (overridden by derived classes):
             - ``pretrained_config_archive_map``: a python ``dict`` of with `short-cut-names` (string) as keys and `url` (string) of associated pretrained model configurations as values.
 
diff --git a/pytorch_transformers/modeling_xlm.py b/pytorch_transformers/modeling_xlm.py
index 941c8dda2f..d01d245bbb 100644
--- a/pytorch_transformers/modeling_xlm.py
+++ b/pytorch_transformers/modeling_xlm.py
@@ -416,6 +416,8 @@ XLM_START_DOCSTRING = r"""    The XLM model was proposed in
 
     Parameters:
         config (:class:`~pytorch_transformers.XLMConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 XLM_INPUTS_DOCSTRING = r"""
diff --git a/pytorch_transformers/modeling_xlnet.py b/pytorch_transformers/modeling_xlnet.py
index e9e75e3ab7..af33c5a6c2 100644
--- a/pytorch_transformers/modeling_xlnet.py
+++ b/pytorch_transformers/modeling_xlnet.py
@@ -647,6 +647,8 @@ XLNET_START_DOCSTRING = r"""    The XLNet model was proposed in
 
     Parameters:
         config (:class:`~pytorch_transformers.XLNetConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the configuration.
+            Check out the :meth:`~pytorch_transformers.PreTrainedModel.from_pretrained` method to load the model weights.
 """
 
 XLNET_INPUTS_DOCSTRING = r"""

From a368b877911862da014ed7b219679effbb8dd8ca Mon Sep 17 00:00:00 2001
From: Peng Qi <qipeng@users.noreply.github.com>
Date: Mon, 19 Aug 2019 13:07:00 -0700
Subject: [PATCH 38/45] Fix #1015

---
 examples/run_squad.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/run_squad.py b/examples/run_squad.py
index f2d29fd6b1..efa835107c 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -498,7 +498,7 @@ def main():
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         model.to(args.device)
 
 

From 28f7ca1f807f0857c24f18c0b28b6b8ebee18c0a Mon Sep 17 00:00:00 2001
From: Zeyao Du <ned1991@gmail.com>
Date: Tue, 20 Aug 2019 15:58:42 +0800
Subject: [PATCH 39/45] swap optimizer.step and scheduler.step

---
 examples/lm_finetuning/simple_lm_finetuning.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lm_finetuning/simple_lm_finetuning.py b/examples/lm_finetuning/simple_lm_finetuning.py
index ba5f832827..dca883d2f6 100644
--- a/examples/lm_finetuning/simple_lm_finetuning.py
+++ b/examples/lm_finetuning/simple_lm_finetuning.py
@@ -602,8 +602,8 @@ def main():
                 nb_tr_examples += input_ids.size(0)
                 nb_tr_steps += 1
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    scheduler.step()  # Update learning rate schedule
                     optimizer.step()
+                    scheduler.step()  # Update learning rate schedule
                     optimizer.zero_grad()
                     global_step += 1
 

From a1359b970cb4bfa41008a45b44dd2a25e579bff3 Mon Sep 17 00:00:00 2001
From: Zeyao Du <ned1991@gmail.com>
Date: Tue, 20 Aug 2019 16:00:07 +0800
Subject: [PATCH 40/45] Update finetune_on_pregenerated.py

---
 examples/lm_finetuning/finetune_on_pregenerated.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 9fcc5f2cb1..ccf1c15313 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -314,8 +314,8 @@ def main():
                 mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                 pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    scheduler.step()  # Update learning rate schedule
                     optimizer.step()
+                    scheduler.step()  # Update learning rate schedule
                     optimizer.zero_grad()
                     global_step += 1
 

From 45ab8bf60e5c2af912006035f5568be92c0c99c9 Mon Sep 17 00:00:00 2001
From: Duzeyao <330501241@qq.com>
Date: Tue, 20 Aug 2019 16:40:39 +0800
Subject: [PATCH 41/45] Revert "Update finetune_on_pregenerated.py"

This reverts commit a1359b970cb4bfa41008a45b44dd2a25e579bff3.
---
 examples/lm_finetuning/finetune_on_pregenerated.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index ccf1c15313..9fcc5f2cb1 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -314,8 +314,8 @@ def main():
                 mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                 pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    optimizer.step()
                     scheduler.step()  # Update learning rate schedule
+                    optimizer.step()
                     optimizer.zero_grad()
                     global_step += 1
 

From d86b49ac86141810af4a7c82ed34e789b3b1937e Mon Sep 17 00:00:00 2001
From: Duzeyao <330501241@qq.com>
Date: Tue, 20 Aug 2019 16:46:34 +0800
Subject: [PATCH 42/45] swap optimizer.step and scheduler.step

---
 examples/lm_finetuning/finetune_on_pregenerated.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/lm_finetuning/finetune_on_pregenerated.py b/examples/lm_finetuning/finetune_on_pregenerated.py
index 9fcc5f2cb1..ccf1c15313 100644
--- a/examples/lm_finetuning/finetune_on_pregenerated.py
+++ b/examples/lm_finetuning/finetune_on_pregenerated.py
@@ -314,8 +314,8 @@ def main():
                 mean_loss = tr_loss * args.gradient_accumulation_steps / nb_tr_steps
                 pbar.set_postfix_str(f"Loss: {mean_loss:.5f}")
                 if (step + 1) % args.gradient_accumulation_steps == 0:
-                    scheduler.step()  # Update learning rate schedule
                     optimizer.step()
+                    scheduler.step()  # Update learning rate schedule
                     optimizer.zero_grad()
                     global_step += 1
 

From b0b9b8091b73f929306704bd8cd62b712621cebc Mon Sep 17 00:00:00 2001
From: Julien Chaumond <chaumond@gmail.com>
Date: Tue, 20 Aug 2019 11:33:46 +0200
Subject: [PATCH 43/45] minor typo

---
 pytorch_transformers/modeling_gpt2.py   | 2 +-
 pytorch_transformers/modeling_openai.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/pytorch_transformers/modeling_gpt2.py b/pytorch_transformers/modeling_gpt2.py
index f67d0e88d5..dd3e465bf3 100644
--- a/pytorch_transformers/modeling_gpt2.py
+++ b/pytorch_transformers/modeling_gpt2.py
@@ -614,7 +614,7 @@ class GPT2LMHeadModel(GPT2PreTrainedModel):
 @add_start_docstrings("""The GPT2 Model transformer with a language modeling and a multiple-choice classification
 head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
 The language modeling head has its weights tied to the input embeddings,
-the classification head takes as input the input of a specified classification token index in the intput sequence).
+the classification head takes as input the input of a specified classification token index in the input sequence).
 """, GPT2_START_DOCSTRING)
 class GPT2DoubleHeadsModel(GPT2PreTrainedModel):
     r"""    Inputs:
diff --git a/pytorch_transformers/modeling_openai.py b/pytorch_transformers/modeling_openai.py
index e8648487be..a4f02111e7 100644
--- a/pytorch_transformers/modeling_openai.py
+++ b/pytorch_transformers/modeling_openai.py
@@ -604,7 +604,7 @@ class OpenAIGPTLMHeadModel(OpenAIGPTPreTrainedModel):
 @add_start_docstrings("""OpenAI GPT Model transformer with a language modeling and a multiple-choice classification
 head on top e.g. for RocStories/SWAG tasks. The two heads are two linear layers.
 The language modeling head has its weights tied to the input embeddings,
-the classification head takes as input the input of a specified classification token index in the intput sequence).
+the classification head takes as input the input of a specified classification token index in the input sequence).
 """, OPENAI_GPT_START_DOCSTRING)
 class OpenAIGPTDoubleHeadsModel(OpenAIGPTPreTrainedModel):
     r"""    Inputs:

From ad6e62cd827d546691845aca5fb9b437c5812d6a Mon Sep 17 00:00:00 2001
From: Nikolay Korolev <korolevns98@gmail.com>
Date: Tue, 20 Aug 2019 15:43:06 +0300
Subject: [PATCH 44/45] Fix typo. configuratoin -> configuration

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 7d2445fc11..4e57de5842 100644
--- a/README.md
+++ b/README.md
@@ -328,7 +328,7 @@ Breaking change in the `from_pretrained()`method:
 
 1. Models are now set in evaluation mode by default when instantiated with the `from_pretrained()` method. To train them don't forget to set them back in training mode (`model.train()`) to activate the dropout modules.
 
-2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead which can break derived model classes build based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding the the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuratoin class attributes.
+2. The additional `*input` and `**kwargs` arguments supplied to the `from_pretrained()` method used to be directly passed to the underlying model's class `__init__()` method. They are now used to update the model configuration attribute instead, which can break derived model classes built based on the previous `BertForSequenceClassification` examples. We are working on a way to mitigate this breaking change in [#866](https://github.com/huggingface/pytorch-transformers/pull/866) by forwarding to the model `__init__()` method (i) the provided positional arguments and (ii) the keyword arguments which do not match any configuration class attributes.
 
 Also, while not a breaking change, the serialization methods have been standardized and you probably should switch to the new method `save_pretrained(save_directory)` if you were using any other serialization method before.
 

From 3bffd2e8e5d726d581e0a66746b25c64d49e231d Mon Sep 17 00:00:00 2001
From: Peng Qi <qipeng@users.noreply.github.com>
Date: Tue, 20 Aug 2019 10:59:28 -0700
Subject: [PATCH 45/45] more fixes

---
 examples/run_glue.py  | 2 +-
 examples/run_squad.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/run_glue.py b/examples/run_glue.py
index 7fb0732e61..1729f4f7e3 100644
--- a/examples/run_glue.py
+++ b/examples/run_glue.py
@@ -467,7 +467,7 @@ def main():
 
         # Load a trained model and vocabulary that you have fine-tuned
         model = model_class.from_pretrained(args.output_dir)
-        tokenizer = tokenizer_class.from_pretrained(args.output_dir)
+        tokenizer = tokenizer_class.from_pretrained(args.output_dir, do_lower_case=args.do_lower_case)
         model.to(args.device)
 
 
diff --git a/examples/run_squad.py b/examples/run_squad.py
index efa835107c..c0586b03bd 100644
--- a/examples/run_squad.py
+++ b/examples/run_squad.py
@@ -481,7 +481,7 @@ def main():
 
 
     # Save the trained model and the tokenizer
-    if args.do_train and args.local_rank == -1 or torch.distributed.get_rank() == 0:
+    if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
         # Create output directory if needed
         if not os.path.exists(args.output_dir) and args.local_rank in [-1, 0]:
             os.makedirs(args.output_dir)