Reorganize repo (#8580)

* Put models in subfolders * Styling * Fix imports in tests * More fixes in test imports * Sneaky hidden imports * Fix imports in doc files * More sneaky imports * Finish fixing tests * Fix examples * Fix path for copies * More fixes for examples * Fix dummy files * More fixes for example * More model import fixes * Is this why you're unhappy GitHub? * Fix imports in conver command
2020-11-16 21:43:42 -05:00
parent 901507335f
commit c89bdfbe72
381 changed files with 2651 additions and 1571 deletions
--- a/src/transformers/models/roberta/init.py
+++ b/src/transformers/models/roberta/init.py
@@ -0,0 +1,39 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+from ...file_utils import is_flax_available, is_tf_available, is_tokenizers_available, is_torch_available
+from .configuration_roberta import ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP, RobertaConfig
+from .tokenization_roberta import RobertaTokenizer
+
+
+if is_tokenizers_available():
+    from .tokenization_roberta_fast import RobertaTokenizerFast
+
+if is_torch_available():
+    from .modeling_roberta import (
+        ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
+        RobertaForCausalLM,
+        RobertaForMaskedLM,
+        RobertaForMultipleChoice,
+        RobertaForQuestionAnswering,
+        RobertaForSequenceClassification,
+        RobertaForTokenClassification,
+        RobertaModel,
+    )
+
+if is_tf_available():
+    from .modeling_tf_roberta import (
+        TF_ROBERTA_PRETRAINED_MODEL_ARCHIVE_LIST,
+        TFRobertaForMaskedLM,
+        TFRobertaForMultipleChoice,
+        TFRobertaForQuestionAnswering,
+        TFRobertaForSequenceClassification,
+        TFRobertaForTokenClassification,
+        TFRobertaMainLayer,
+        TFRobertaModel,
+        TFRobertaPreTrainedModel,
+    )
+
+if is_flax_available():
+    from .modeling_flax_roberta import FlaxRobertaModel
--- a/src/transformers/models/roberta/configuration_roberta.py
+++ b/src/transformers/models/roberta/configuration_roberta.py
@@ -0,0 +1,64 @@
+# coding=utf-8
+# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" RoBERTa configuration """
+
+from ...utils import logging
+from ..bert.configuration_bert import BertConfig
+
+
+logger = logging.get_logger(__name__)
+
+ROBERTA_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "roberta-base": "https://huggingface.co/roberta-base/resolve/main/config.json",
+    "roberta-large": "https://huggingface.co/roberta-large/resolve/main/config.json",
+    "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/config.json",
+    "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/config.json",
+    "roberta-base-openai-detector": "https://huggingface.co/roberta-base-openai-detector/resolve/main/config.json",
+    "roberta-large-openai-detector": "https://huggingface.co/roberta-large-openai-detector/resolve/main/config.json",
+}
+
+
+class RobertaConfig(BertConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.RobertaModel` or a
+    :class:`~transformers.TFRobertaModel`. It is used to instantiate a RoBERTa model according to the specified
+    arguments, defining the model architecture.
+
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+    The :class:`~transformers.RobertaConfig` class directly inherits :class:`~transformers.BertConfig`. It reuses the
+    same defaults. Please check the parent class for more information.
+
+    Examples::
+
+        >>> from transformers import RobertaConfig, RobertaModel
+
+        >>> # Initializing a RoBERTa configuration
+        >>> configuration = RobertaConfig()
+
+        >>> # Initializing a model from the configuration
+        >>> model = RobertaModel(configuration)
+
+        >>> # Accessing the model configuration
+        >>> configuration = model.config
+    """
+    model_type = "roberta"
+
+    def __init__(self, pad_token_id=1, bos_token_id=0, eos_token_id=2, **kwargs):
+        """Constructs RobertaConfig."""
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
--- a/src/transformers/models/roberta/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
+++ b/src/transformers/models/roberta/convert_roberta_original_pytorch_checkpoint_to_pytorch.py
@@ -0,0 +1,182 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Convert RoBERTa checkpoint."""
+
+
+import argparse
+import pathlib
+
+import fairseq
+import torch
+from fairseq.models.roberta import RobertaModel as FairseqRobertaModel
+from fairseq.modules import TransformerSentenceEncoderLayer
+from packaging import version
+
+from transformers.models.bertmodeling_bert import (
+    BertIntermediate,
+    BertLayer,
+    BertOutput,
+    BertSelfAttention,
+    BertSelfOutput,
+)
+from transformers.models.roberta.modeling_roberta import (
+    RobertaConfig,
+    RobertaForMaskedLM,
+    RobertaForSequenceClassification,
+)
+from transformers.utils import logging
+
+
+if version.parse(fairseq.__version__) < version.parse("0.9.0"):
+    raise Exception("requires fairseq >= 0.9.0")
+
+
+logging.set_verbosity_info()
+logger = logging.get_logger(__name__)
+
+SAMPLE_TEXT = "Hello world! cécé herlolip"
+
+
+def convert_roberta_checkpoint_to_pytorch(
+    roberta_checkpoint_path: str, pytorch_dump_folder_path: str, classification_head: bool
+):
+    """
+    Copy/paste/tweak roberta's weights to our BERT structure.
+    """
+    roberta = FairseqRobertaModel.from_pretrained(roberta_checkpoint_path)
+    roberta.eval()  # disable dropout
+    roberta_sent_encoder = roberta.model.encoder.sentence_encoder
+    config = RobertaConfig(
+        vocab_size=roberta_sent_encoder.embed_tokens.num_embeddings,
+        hidden_size=roberta.args.encoder_embed_dim,
+        num_hidden_layers=roberta.args.encoder_layers,
+        num_attention_heads=roberta.args.encoder_attention_heads,
+        intermediate_size=roberta.args.encoder_ffn_embed_dim,
+        max_position_embeddings=514,
+        type_vocab_size=1,
+        layer_norm_eps=1e-5,  # PyTorch default used in fairseq
+    )
+    if classification_head:
+        config.num_labels = roberta.model.classification_heads["mnli"].out_proj.weight.shape[0]
+    print("Our BERT config:", config)
+
+    model = RobertaForSequenceClassification(config) if classification_head else RobertaForMaskedLM(config)
+    model.eval()
+
+    # Now let's copy all the weights.
+    # Embeddings
+    model.roberta.embeddings.word_embeddings.weight = roberta_sent_encoder.embed_tokens.weight
+    model.roberta.embeddings.position_embeddings.weight = roberta_sent_encoder.embed_positions.weight
+    model.roberta.embeddings.token_type_embeddings.weight.data = torch.zeros_like(
+        model.roberta.embeddings.token_type_embeddings.weight
+    )  # just zero them out b/c RoBERTa doesn't use them.
+    model.roberta.embeddings.LayerNorm.weight = roberta_sent_encoder.emb_layer_norm.weight
+    model.roberta.embeddings.LayerNorm.bias = roberta_sent_encoder.emb_layer_norm.bias
+
+    for i in range(config.num_hidden_layers):
+        # Encoder: start of layer
+        layer: BertLayer = model.roberta.encoder.layer[i]
+        roberta_layer: TransformerSentenceEncoderLayer = roberta_sent_encoder.layers[i]
+
+        # self attention
+        self_attn: BertSelfAttention = layer.attention.self
+        assert (
+            roberta_layer.self_attn.k_proj.weight.data.shape
+            == roberta_layer.self_attn.q_proj.weight.data.shape
+            == roberta_layer.self_attn.v_proj.weight.data.shape
+            == torch.Size((config.hidden_size, config.hidden_size))
+        )
+
+        self_attn.query.weight.data = roberta_layer.self_attn.q_proj.weight
+        self_attn.query.bias.data = roberta_layer.self_attn.q_proj.bias
+        self_attn.key.weight.data = roberta_layer.self_attn.k_proj.weight
+        self_attn.key.bias.data = roberta_layer.self_attn.k_proj.bias
+        self_attn.value.weight.data = roberta_layer.self_attn.v_proj.weight
+        self_attn.value.bias.data = roberta_layer.self_attn.v_proj.bias
+
+        # self-attention output
+        self_output: BertSelfOutput = layer.attention.output
+        assert self_output.dense.weight.shape == roberta_layer.self_attn.out_proj.weight.shape
+        self_output.dense.weight = roberta_layer.self_attn.out_proj.weight
+        self_output.dense.bias = roberta_layer.self_attn.out_proj.bias
+        self_output.LayerNorm.weight = roberta_layer.self_attn_layer_norm.weight
+        self_output.LayerNorm.bias = roberta_layer.self_attn_layer_norm.bias
+
+        # intermediate
+        intermediate: BertIntermediate = layer.intermediate
+        assert intermediate.dense.weight.shape == roberta_layer.fc1.weight.shape
+        intermediate.dense.weight = roberta_layer.fc1.weight
+        intermediate.dense.bias = roberta_layer.fc1.bias
+
+        # output
+        bert_output: BertOutput = layer.output
+        assert bert_output.dense.weight.shape == roberta_layer.fc2.weight.shape
+        bert_output.dense.weight = roberta_layer.fc2.weight
+        bert_output.dense.bias = roberta_layer.fc2.bias
+        bert_output.LayerNorm.weight = roberta_layer.final_layer_norm.weight
+        bert_output.LayerNorm.bias = roberta_layer.final_layer_norm.bias
+        # end of layer
+
+    if classification_head:
+        model.classifier.dense.weight = roberta.model.classification_heads["mnli"].dense.weight
+        model.classifier.dense.bias = roberta.model.classification_heads["mnli"].dense.bias
+        model.classifier.out_proj.weight = roberta.model.classification_heads["mnli"].out_proj.weight
+        model.classifier.out_proj.bias = roberta.model.classification_heads["mnli"].out_proj.bias
+    else:
+        # LM Head
+        model.lm_head.dense.weight = roberta.model.encoder.lm_head.dense.weight
+        model.lm_head.dense.bias = roberta.model.encoder.lm_head.dense.bias
+        model.lm_head.layer_norm.weight = roberta.model.encoder.lm_head.layer_norm.weight
+        model.lm_head.layer_norm.bias = roberta.model.encoder.lm_head.layer_norm.bias
+        model.lm_head.decoder.weight = roberta.model.encoder.lm_head.weight
+        model.lm_head.decoder.bias = roberta.model.encoder.lm_head.bias
+
+    # Let's check that we get the same results.
+    input_ids: torch.Tensor = roberta.encode(SAMPLE_TEXT).unsqueeze(0)  # batch of size 1
+
+    our_output = model(input_ids)[0]
+    if classification_head:
+        their_output = roberta.model.classification_heads["mnli"](roberta.extract_features(input_ids))
+    else:
+        their_output = roberta.model(input_ids)[0]
+    print(our_output.shape, their_output.shape)
+    max_absolute_diff = torch.max(torch.abs(our_output - their_output)).item()
+    print(f"max_absolute_diff = {max_absolute_diff}")  # ~ 1e-7
+    success = torch.allclose(our_output, their_output, atol=1e-3)
+    print("Do both models output the same tensors?", "🔥" if success else "💩")
+    if not success:
+        raise Exception("Something went wRoNg")
+
+    pathlib.Path(pytorch_dump_folder_path).mkdir(parents=True, exist_ok=True)
+    print(f"Saving model to {pytorch_dump_folder_path}")
+    model.save_pretrained(pytorch_dump_folder_path)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument(
+        "--roberta_checkpoint_path", default=None, type=str, required=True, help="Path the official PyTorch dump."
+    )
+    parser.add_argument(
+        "--pytorch_dump_folder_path", default=None, type=str, required=True, help="Path to the output PyTorch model."
+    )
+    parser.add_argument(
+        "--classification_head", action="store_true", help="Whether to convert a final classification head."
+    )
+    args = parser.parse_args()
+    convert_roberta_checkpoint_to_pytorch(
+        args.roberta_checkpoint_path, args.pytorch_dump_folder_path, args.classification_head
+    )
--- a/src/transformers/models/roberta/modeling_flax_roberta.py
+++ b/src/transformers/models/roberta/modeling_flax_roberta.py
@@ -0,0 +1,432 @@
+# coding=utf-8
+# Copyright 2018 The Google Flax Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Callable, Dict
+
+import numpy as np
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+
+from ...file_utils import add_start_docstrings, add_start_docstrings_to_model_forward
+from ...modeling_flax_utils import FlaxPreTrainedModel, gelu
+from ...utils import logging
+from .configuration_roberta import RobertaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "RobertaConfig"
+_TOKENIZER_FOR_DOC = "RobertaTokenizer"
+
+
+ROBERTA_START_DOCSTRING = r"""
+
+    This model inherits from :class:`~transformers.FlaxPreTrainedModel`. Check the superclass documentation for the
+    generic methods the library implements for all its model (such as downloading, saving and converting weights from
+    PyTorch models)
+
+    This model is also a Flax Linen `flax.nn.Module
+    <https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html>`__ subclass. Use it as a regular Flax
+    Module and refer to the Flax documentation for all matter related to general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - `Just-In-Time (JIT) compilation <https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit>`__
+    - `Automatic Differentiation <https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation>`__
+    - `Vectorization <https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap>`__
+    - `Parallelization <https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap>`__
+
+    Parameters:
+        config (:class:`~transformers.RobertaConfig`): Model configuration class with all the parameters of the
+            model. Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+            weights.
+"""
+
+ROBERTA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using :class:`~transformers.BertTokenizer`. See
+            :func:`transformers.PreTrainedTokenizer.encode` and :func:`transformers.PreTrainedTokenizer.__call__` for
+            details.
+
+            `What are input IDs? <../glossary.html#input-ids>`__
+        attention_mask (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
+            Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            `What are attention masks? <../glossary.html#attention-mask>`__
+        token_type_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in ``[0,
+            1]``:
+
+            - 0 corresponds to a `sentence A` token,
+            - 1 corresponds to a `sentence B` token.
+
+            `What are token type IDs? <../glossary.html#token-type-ids>`__
+        position_ids (:obj:`numpy.ndarray` of shape :obj:`({0})`, `optional`):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range ``[0,
+            config.max_position_embeddings - 1]``.
+        return_dict (:obj:`bool`, `optional`):
+            Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple.
+"""
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerNorm with Bert->Roberta
+class FlaxRobertaLayerNorm(nn.Module):
+    """
+    Layer normalization (https://arxiv.org/abs/1607.06450). Operates on the last axis of the input data.
+    """
+
+    epsilon: float = 1e-6
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    bias: bool = True  # If True, bias (beta) is added.
+    scale: bool = True  # If True, multiply by scale (gamma). When the next layer is linear
+    # (also e.g. nn.relu), this can be disabled since the scaling will be
+    # done by the next layer.
+    bias_init: jnp.ndarray = nn.initializers.zeros
+    scale_init: jnp.ndarray = nn.initializers.ones
+
+    @nn.compact
+    def __call__(self, x):
+        """
+        Applies layer normalization on the input. It normalizes the activations of the layer for each given example in
+        a batch independently, rather than across a batch like Batch Normalization. i.e. applies a transformation that
+        maintains the mean activation within each example close to 0 and the activation standard deviation close to 1
+
+        Args:
+          x: the inputs
+
+        Returns:
+          Normalized inputs (the same shape as inputs).
+        """
+        features = x.shape[-1]
+        mean = jnp.mean(x, axis=-1, keepdims=True)
+        mean2 = jnp.mean(jax.lax.square(x), axis=-1, keepdims=True)
+        var = mean2 - jax.lax.square(mean)
+        mul = jax.lax.rsqrt(var + self.epsilon)
+        if self.scale:
+            mul = mul * jnp.asarray(self.param("gamma", self.scale_init, (features,)), self.dtype)
+        y = (x - mean) * mul
+        if self.bias:
+            y = y + jnp.asarray(self.param("beta", self.bias_init, (features,)), self.dtype)
+        return y
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbedding with Bert->Roberta
+class FlaxRobertaEmbedding(nn.Module):
+    """
+    Specify a new class for doing the embedding stuff as Flax's one use 'embedding' for the parameter name and PyTorch
+    use 'weight'
+    """
+
+    vocab_size: int
+    hidden_size: int
+    emb_init: Callable[..., np.ndarray] = nn.initializers.normal(stddev=0.1)
+
+    @nn.compact
+    def __call__(self, inputs):
+        embedding = self.param("weight", self.emb_init, (self.vocab_size, self.hidden_size))
+        return jnp.take(embedding, inputs, axis=0)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings with Bert->Roberta
+class FlaxRobertaEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    vocab_size: int
+    hidden_size: int
+    type_vocab_size: int
+    max_length: int
+
+    @nn.compact
+    def __call__(self, input_ids, token_type_ids, position_ids, attention_mask):
+
+        # Embed
+        w_emb = FlaxRobertaEmbedding(self.vocab_size, self.hidden_size, name="word_embeddings")(
+            jnp.atleast_2d(input_ids.astype("i4"))
+        )
+        p_emb = FlaxRobertaEmbedding(self.max_length, self.hidden_size, name="position_embeddings")(
+            jnp.atleast_2d(position_ids.astype("i4"))
+        )
+        t_emb = FlaxRobertaEmbedding(self.type_vocab_size, self.hidden_size, name="token_type_embeddings")(
+            jnp.atleast_2d(token_type_ids.astype("i4"))
+        )
+
+        # Sum all embeddings
+        summed_emb = w_emb + jnp.broadcast_to(p_emb, w_emb.shape) + t_emb
+
+        # Layer Norm
+        layer_norm = FlaxRobertaLayerNorm(name="layer_norm")(summed_emb)
+
+        return layer_norm
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertAttention with Bert->Roberta
+class FlaxRobertaAttention(nn.Module):
+    num_heads: int
+    head_size: int
+
+    @nn.compact
+    def __call__(self, hidden_state, attention_mask):
+        self_att = nn.attention.SelfAttention(num_heads=self.num_heads, qkv_features=self.head_size, name="self")(
+            hidden_state, attention_mask
+        )
+
+        layer_norm = FlaxRobertaLayerNorm(name="layer_norm")(self_att + hidden_state)
+        return layer_norm
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertIntermediate with Bert->Roberta
+class FlaxRobertaIntermediate(nn.Module):
+    output_size: int
+
+    @nn.compact
+    def __call__(self, hidden_state):
+        # TODO: Add ACT2FN reference to change activation function
+        dense = nn.Dense(features=self.output_size, name="dense")(hidden_state)
+        return gelu(dense)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert->Roberta
+class FlaxRobertaOutput(nn.Module):
+    @nn.compact
+    def __call__(self, intermediate_output, attention_output):
+        hidden_state = nn.Dense(attention_output.shape[-1], name="dense")(intermediate_output)
+        hidden_state = FlaxRobertaLayerNorm(name="layer_norm")(hidden_state + attention_output)
+        return hidden_state
+
+
+class FlaxRobertaLayer(nn.Module):
+    num_heads: int
+    head_size: int
+    intermediate_size: int
+
+    @nn.compact
+    def __call__(self, hidden_state, attention_mask):
+        attention = FlaxRobertaAttention(self.num_heads, self.head_size, name="attention")(
+            hidden_state, attention_mask
+        )
+        intermediate = FlaxRobertaIntermediate(self.intermediate_size, name="intermediate")(attention)
+        output = FlaxRobertaOutput(name="output")(intermediate, attention)
+
+        return output
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection with Bert->Roberta
+class FlaxRobertaLayerCollection(nn.Module):
+    """
+    Stores N RobertaLayer(s)
+    """
+
+    num_layers: int
+    num_heads: int
+    head_size: int
+    intermediate_size: int
+
+    @nn.compact
+    def __call__(self, inputs, attention_mask):
+        assert self.num_layers > 0, f"num_layers should be >= 1, got ({self.num_layers})"
+
+        # Initialize input / output
+        input_i = inputs
+
+        # Forward over all encoders
+        for i in range(self.num_layers):
+            layer = FlaxRobertaLayer(self.num_heads, self.head_size, self.intermediate_size, name=f"{i}")
+            input_i = layer(input_i, attention_mask)
+        return input_i
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->Roberta
+class FlaxRobertaEncoder(nn.Module):
+    num_layers: int
+    num_heads: int
+    head_size: int
+    intermediate_size: int
+
+    @nn.compact
+    def __call__(self, hidden_state, attention_mask):
+        layer = FlaxRobertaLayerCollection(
+            self.num_layers, self.num_heads, self.head_size, self.intermediate_size, name="layer"
+        )(hidden_state, attention_mask)
+        return layer
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPooler with Bert->Roberta
+class FlaxRobertaPooler(nn.Module):
+    @nn.compact
+    def __call__(self, hidden_state):
+        cls_token = hidden_state[:, 0]
+        out = nn.Dense(hidden_state.shape[-1], name="dense")(cls_token)
+        return jax.lax.tanh(out)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertModule with Bert->Roberta
+class FlaxRobertaModule(nn.Module):
+    vocab_size: int
+    hidden_size: int
+    type_vocab_size: int
+    max_length: int
+    num_encoder_layers: int
+    num_heads: int
+    head_size: int
+    intermediate_size: int
+
+    @nn.compact
+    def __call__(self, input_ids, attention_mask, token_type_ids, position_ids):
+
+        # Embedding
+        embeddings = FlaxRobertaEmbeddings(
+            self.vocab_size, self.hidden_size, self.type_vocab_size, self.max_length, name="embeddings"
+        )(input_ids, token_type_ids, position_ids, attention_mask)
+
+        # N stacked encoding layers
+        encoder = FlaxRobertaEncoder(
+            self.num_encoder_layers, self.num_heads, self.head_size, self.intermediate_size, name="encoder"
+        )(embeddings, attention_mask)
+
+        pooled = FlaxRobertaPooler(name="pooler")(encoder)
+        return encoder, pooled
+
+
+@add_start_docstrings(
+    "The bare RoBERTa Model transformer outputting raw hidden-states without any specific head on top.",
+    ROBERTA_START_DOCSTRING,
+)
+class FlaxRobertaModel(FlaxPreTrainedModel):
+    """
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in `Attention is
+    all you need`_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
+    Kaiser and Illia Polosukhin.
+    """
+
+    model_class = FlaxRobertaModule
+    config_class = RobertaConfig
+    base_model_prefix = "roberta"
+
+    @staticmethod
+    def convert_from_pytorch(pt_state: Dict, config: RobertaConfig) -> Dict:
+        jax_state = dict(pt_state)
+
+        # Need to change some parameters name to match Flax names so that we don't have to fork any layer
+        for key, tensor in pt_state.items():
+            # Key parts
+            key_parts = set(key.split("."))
+
+            # Every dense layer has "kernel" parameters instead of "weight"
+            if "dense.weight" in key:
+                del jax_state[key]
+                key = key.replace("weight", "kernel")
+                jax_state[key] = tensor
+
+            # SelfAttention needs also to replace "weight" by "kernel"
+            if {"query", "key", "value"} & key_parts:
+
+                # Flax SelfAttention decomposes the heads (num_head, size // num_heads)
+                if "bias" in key:
+                    jax_state[key] = tensor.reshape((config.num_attention_heads, -1))
+                elif "weight":
+                    del jax_state[key]
+                    key = key.replace("weight", "kernel")
+                    tensor = tensor.reshape((config.num_attention_heads, -1, config.hidden_size)).transpose((2, 0, 1))
+                    jax_state[key] = tensor
+
+            # SelfAttention output is not a separate layer, remove one nesting
+            if "attention.output.dense" in key:
+                del jax_state[key]
+                key = key.replace("attention.output.dense", "attention.self.out")
+                jax_state[key] = tensor
+
+            # SelfAttention output is not a separate layer, remove nesting on layer norm
+            if "attention.output.LayerNorm" in key:
+                del jax_state[key]
+                key = key.replace("attention.output.LayerNorm", "attention.LayerNorm")
+                jax_state[key] = tensor
+
+            # There are some transposed parameters w.r.t their PyTorch counterpart
+            if "intermediate.dense.kernel" in key or "output.dense.kernel" in key:
+                jax_state[key] = tensor.T
+
+            # Self Attention output projection needs to be transposed
+            if "out.kernel" in key:
+                jax_state[key] = tensor.reshape((config.hidden_size, config.num_attention_heads, -1)).transpose(
+                    1, 2, 0
+                )
+
+            # Pooler needs to transpose its kernel
+            if "pooler.dense.kernel" in key:
+                jax_state[key] = tensor.T
+
+            # Handle LayerNorm conversion
+            if "LayerNorm" in key:
+                del jax_state[key]
+
+                # Replace LayerNorm by layer_norm
+                new_key = key.replace("LayerNorm", "layer_norm")
+
+                if "weight" in key:
+                    new_key = new_key.replace("weight", "gamma")
+                elif "bias" in key:
+                    new_key = new_key.replace("bias", "beta")
+
+                jax_state[new_key] = tensor
+
+        return jax_state
+
+    def __init__(self, config: RobertaConfig, state: dict, seed: int = 0, **kwargs):
+        model = FlaxRobertaModule(
+            vocab_size=config.vocab_size,
+            hidden_size=config.hidden_size,
+            type_vocab_size=config.type_vocab_size,
+            max_length=config.max_position_embeddings,
+            num_encoder_layers=config.num_hidden_layers,
+            num_heads=config.num_attention_heads,
+            head_size=config.hidden_size,
+            intermediate_size=config.intermediate_size,
+        )
+
+        super().__init__(config, model, state, seed)
+
+    @property
+    def module(self) -> nn.Module:
+        return self._module
+
+    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    def __call__(self, input_ids, token_type_ids=None, attention_mask=None, position_ids=None):
+        if token_type_ids is None:
+            token_type_ids = jnp.ones_like(input_ids)
+
+        if position_ids is None:
+            position_ids = np.arange(
+                self.config.pad_token_id + 1, np.atleast_2d(input_ids).shape[-1] + self.config.pad_token_id + 1
+            )
+
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+
+        return self.model.apply(
+            {"params": self.params},
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            jnp.array(token_type_ids, dtype="i4"),
+            jnp.array(position_ids, dtype="i4"),
+        )
--- a/src/transformers/models/roberta/modeling_roberta.py
+++ b/src/transformers/models/roberta/modeling_roberta.py
--- a/src/transformers/models/roberta/modeling_tf_roberta.py
+++ b/src/transformers/models/roberta/modeling_tf_roberta.py
--- a/src/transformers/models/roberta/tokenization_roberta.py
+++ b/src/transformers/models/roberta/tokenization_roberta.py
@@ -0,0 +1,264 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for RoBERTa."""
+
+import warnings
+from typing import List, Optional
+
+from ...tokenization_utils import AddedToken
+from ...utils import logging
+from ..gpt2.tokenization_gpt2 import GPT2Tokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
+        "roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
+        "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
+        "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
+        "roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
+        "roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
+    },
+    "merges_file": {
+        "roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
+        "roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
+        "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
+        "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
+        "roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
+        "roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "roberta-base": 512,
+    "roberta-large": 512,
+    "roberta-large-mnli": 512,
+    "distilroberta-base": 512,
+    "roberta-base-openai-detector": 512,
+    "roberta-large-openai-detector": 512,
+}
+
+
+class RobertaTokenizer(GPT2Tokenizer):
+    """
+    Constructs a RoBERTa tokenizer, derived from the GPT-2 tokenizer, using byte-level Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ::
+
+        >>> from transformers import RobertaTokenizer
+        >>> tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
+        >>> tokenizer("Hello world")['input_ids']
+        [0, 31414, 232, 328, 2]
+        >>> tokenizer(" Hello world")['input_ids']
+        [0, 20920, 232, 2]
+
+    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    .. note::
+
+        When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first
+        one).
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        merges_file (:obj:`str`):
+            Path to the merges file.
+        errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
+            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
+        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the beginning of
+                sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end of
+                sequence. The token used is the :obj:`sep_token`.
+        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        add_prefix_space=False,
+        **kwargs
+    ):
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        super().__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            errors=errors,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            add_prefix_space=add_prefix_space,
+            **kwargs,
+        )
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A RoBERTa sequence has the following format:
+
+        - single sequence: ``<s> X </s>``
+        - pair of sequences: ``<s> A </s></s> B </s>``
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` method.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formatted with special tokens for the model."
+                )
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
+        make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
+        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
+        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
+            text = " " + text
+        return (text, kwargs)
--- a/src/transformers/models/roberta/tokenization_roberta_fast.py
+++ b/src/transformers/models/roberta/tokenization_roberta_fast.py
@@ -0,0 +1,230 @@
+# coding=utf-8
+# Copyright 2018 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Tokenization classes for RoBERTa."""
+
+from typing import List, Optional
+
+from ...tokenization_utils_base import AddedToken
+from ...utils import logging
+from ..gpt2.tokenization_gpt2_fast import GPT2TokenizerFast
+from .tokenization_roberta import RobertaTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "roberta-base": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
+        "roberta-large": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
+        "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/vocab.json",
+        "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/vocab.json",
+        "roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/vocab.json",
+        "roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/vocab.json",
+    },
+    "merges_file": {
+        "roberta-base": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
+        "roberta-large": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
+        "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/merges.txt",
+        "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/merges.txt",
+        "roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/merges.txt",
+        "roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/merges.txt",
+    },
+    "tokenizer_file": {
+        "roberta-base": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json",
+        "roberta-large": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json",
+        "roberta-large-mnli": "https://huggingface.co/roberta-large-mnli/resolve/main/tokenizer.json",
+        "distilroberta-base": "https://huggingface.co/distilroberta-base/resolve/main/tokenizer.json",
+        "roberta-base-openai-detector": "https://huggingface.co/roberta-base/resolve/main/tokenizer.json",
+        "roberta-large-openai-detector": "https://huggingface.co/roberta-large/resolve/main/tokenizer.json",
+    },
+}
+
+PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
+    "roberta-base": 512,
+    "roberta-large": 512,
+    "roberta-large-mnli": 512,
+    "distilroberta-base": 512,
+    "roberta-base-openai-detector": 512,
+    "roberta-large-openai-detector": 512,
+}
+
+
+class RobertaTokenizerFast(GPT2TokenizerFast):
+    """
+    Construct a "fast" RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library), derived from the GPT-2
+    tokenizer, using byte-level Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ::
+
+        >>> from transformers import RobertaTokenizerFast
+        >>> tokenizer = RobertaTokenizerFast.from_pretrained("roberta-base")
+        >>> tokenizer("Hello world")['input_ids']
+        [0, 31414, 232, 328, 2]
+        >>> tokenizer(" Hello world")['input_ids']
+        [0, 20920, 232, 2]
+
+    You can get around that behavior by passing ``add_prefix_space=True`` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    .. note::
+
+        When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
+        ``add_prefix_space=True``.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
+    methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        merges_file (:obj:`str`):
+            Path to the merges file.
+        errors (:obj:`str`, `optional`, defaults to :obj:`"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See `bytes.decode
+            <https://docs.python.org/3/library/stdtypes.html#bytes.decode>`__ for more information.
+        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the beginning of
+                sequence. The token used is the :obj:`cls_token`.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end of
+                sequence. The token used is the :obj:`sep_token`.
+        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        add_prefix_space (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (RoBERTa tokenizer detect beginning of words by the preceding space).
+        trim_offsets (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether the post processing step should trim offsets to avoid including whitespaces.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]
+    slow_tokenizer_class = RobertaTokenizer
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        tokenizer_file=None,
+        errors="replace",
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        add_prefix_space=False,
+        **kwargs
+    ):
+        super().__init__(
+            vocab_file,
+            merges_file,
+            tokenizer_file=tokenizer_file,
+            errors=errors,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            add_prefix_space=add_prefix_space,
+            **kwargs,
+        )
+
+    @property
+    def mask_token(self) -> str:
+        """
+        :obj:`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while
+        not having been set.
+
+        Roberta tokenizer has a special mask token to be usble in the fill-mask pipeline. The mask token will greedily
+        comprise the space before the `<mask>`.
+        """
+        if self._mask_token is None and self.verbose:
+            logger.error("Using mask_token, but it is not set yet.")
+            return None
+        return str(self._mask_token)
+
+    @mask_token.setter
+    def mask_token(self, value):
+        """
+        Overriding the default behavior of the mask token to have it eat the space before it.
+
+        This is needed to preserve backward compatibility with all the previously used models based on Roberta.
+        """
+        # Mask token behave like a normal word, i.e. include the space before it
+        # So we set lstrip to True
+        value = AddedToken(value, lstrip=True, rstrip=False) if isinstance(value, str) else value
+        self._mask_token = value
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        output = [self.bos_token_id] + token_ids_0 + [self.eos_token_id]
+        if token_ids_1 is None:
+            return output
+
+        return output + [self.eos_token_id] + token_ids_1 + [self.eos_token_id]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. RoBERTa does not
+        make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]