Add m2m100 (#10236)

* m2m_100 * no layernorm_embedding * sinusoidal positional embeddings * update pos embeddings * add default config values * tokenizer * add conversion script * fix config * fix pos embed * remove _float_tensor * update tokenizer * update lang codes * handle lang codes * fix pos embeds * fix spm key * put embedding weights on device * remove qa and seq classification heads * fix convert script * lang codes pn one line * fix embeds * fix tokenizer * fix tokenizer * add fast tokenizer * style * M2M100MT => M2M100 * fix copyright, style * tokenizer converter * vocab file * remove fast tokenizer * fix embeds * fix tokenizer * fix tests * add tokenizer tests * add integration test * quality * fix model name * fix test * doc * doc * fix doc * add copied from statements * fix tokenizer tests * apply review suggestions * fix urls * fix shift_tokens_right * apply review suggestions * fix * fix doc * add lang code to id * remove unused function * update checkpoint names * fix copy * fix tokenizer * fix checkpoint names * fix merge issue * style
2021-03-06 22:14:16 +05:30
parent fd01104435
commit f6e74a63ca
17 changed files with 2699 additions and 18 deletions
--- a/src/transformers/models/init.py
+++ b/src/transformers/models/init.py
@@ -45,6 +45,7 @@ from . import (
    led,
    longformer,
    lxmert,
+    m2m_100,
    marian,
    mbart,
    mmbt,
--- a/src/transformers/models/auto/configuration_auto.py
+++ b/src/transformers/models/auto/configuration_auto.py
@@ -45,6 +45,7 @@ from ..layoutlm.configuration_layoutlm import LAYOUTLM_PRETRAINED_CONFIG_ARCHIVE
 from ..led.configuration_led import LED_PRETRAINED_CONFIG_ARCHIVE_MAP, LEDConfig
 from ..longformer.configuration_longformer import LONGFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP, LongformerConfig
 from ..lxmert.configuration_lxmert import LXMERT_PRETRAINED_CONFIG_ARCHIVE_MAP, LxmertConfig
+from ..m2m_100.configuration_m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config
 from ..marian.configuration_marian import MarianConfig
 from ..mbart.configuration_mbart import MBART_PRETRAINED_CONFIG_ARCHIVE_MAP, MBartConfig
 from ..mobilebert.configuration_mobilebert import MobileBertConfig
@@ -76,6 +77,7 @@ ALL_PRETRAINED_CONFIG_ARCHIVE_MAP = dict(
    for pretrained_map in [
        # Add archive maps here
        WAV_2_VEC_2_PRETRAINED_CONFIG_ARCHIVE_MAP,
+        M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP,
        CONVBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
        LED_PRETRAINED_CONFIG_ARCHIVE_MAP,
        BLENDERBOT_SMALL_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -121,6 +123,7 @@ CONFIG_MAPPING = OrderedDict(
    [
        # Add configs here
        ("wav2vec2", Wav2Vec2Config),
+        ("m2m_100", M2M100Config),
        ("convbert", ConvBertConfig),
        ("led", LEDConfig),
        ("blenderbot-small", BlenderbotSmallConfig),
@@ -172,6 +175,7 @@ MODEL_NAMES_MAPPING = OrderedDict(
    [
        # Add full (and cased) model names here
        ("wav2vec2", "Wav2Vec2"),
+        ("m2m_100", "M2M100"),
        ("convbert", "ConvBERT"),
        ("led", "LED"),
        ("blenderbot-small", "BlenderbotSmall"),
--- a/src/transformers/models/auto/modeling_auto.py
+++ b/src/transformers/models/auto/modeling_auto.py
@@ -158,6 +158,7 @@ from ..longformer.modeling_longformer import (
    LongformerModel,
 )
 from ..lxmert.modeling_lxmert import LxmertForPreTraining, LxmertForQuestionAnswering, LxmertModel
+from ..m2m_100.modeling_m2m_100 import M2M100ForConditionalGeneration, M2M100Model
 from ..marian.modeling_marian import MarianForCausalLM, MarianModel, MarianMTModel
 from ..mbart.modeling_mbart import (
    MBartForCausalLM,
@@ -283,6 +284,7 @@ from .configuration_auto import (
    LEDConfig,
    LongformerConfig,
    LxmertConfig,
+    M2M100Config,
    MarianConfig,
    MBartConfig,
    MobileBertConfig,
@@ -314,6 +316,7 @@ MODEL_MAPPING = OrderedDict(
    [
        # Base model mapping
        (Wav2Vec2Config, Wav2Vec2Model),
+        (M2M100Config, M2M100Model),
        (ConvBertConfig, ConvBertModel),
        (LEDConfig, LEDModel),
        (BlenderbotSmallConfig, BlenderbotSmallModel),
@@ -397,6 +400,7 @@ MODEL_WITH_LM_HEAD_MAPPING = OrderedDict(
    [
        # Model with LM heads mapping
        (Wav2Vec2Config, Wav2Vec2ForMaskedLM),
+        (M2M100Config, M2M100ForConditionalGeneration),
        (ConvBertConfig, ConvBertForMaskedLM),
        (LEDConfig, LEDForConditionalGeneration),
        (BlenderbotSmallConfig, BlenderbotSmallForConditionalGeneration),
@@ -495,6 +499,7 @@ MODEL_FOR_MASKED_LM_MAPPING = OrderedDict(
 MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = OrderedDict(
    [
        # Model for Seq2Seq Causal LM mapping
+        (M2M100Config, M2M100ForConditionalGeneration),
        (LEDConfig, LEDForConditionalGeneration),
        (BlenderbotSmallConfig, BlenderbotSmallForConditionalGeneration),
        (MT5Config, MT5ForConditionalGeneration),
--- a/src/transformers/models/m2m_100/init.py
+++ b/src/transformers/models/m2m_100/init.py
@@ -0,0 +1,67 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...file_utils import _BaseLazyModule, is_tokenizers_available, is_torch_available
+
+
+_import_structure = {
+    "configuration_m2m_100": ["M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP", "M2M100Config"],
+    "tokenization_m2m_100": ["M2M100Tokenizer"],
+}
+
+
+if is_torch_available():
+    _import_structure["modeling_m2m_100"] = [
+        "M2M_100_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "M2M100ForConditionalGeneration",
+        "M2M100Model",
+        "M2M100PreTrainedModel",
+    ]
+
+
+if TYPE_CHECKING:
+    from .configuration_m2m_100 import M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP, M2M100Config
+    from .tokenization_m2m_100 import M2M100Tokenizer
+
+    if is_torch_available():
+        from .modeling_m2m_100 import (
+            M2M_100_PRETRAINED_MODEL_ARCHIVE_LIST,
+            M2M100ForConditionalGeneration,
+            M2M100Model,
+            M2M100PreTrainedModel,
+        )
+
+
+else:
+    import importlib
+    import os
+    import sys
+
+    class _LazyModule(_BaseLazyModule):
+        """
+        Module class that surfaces all objects but only performs associated imports when the objects are requested.
+        """
+
+        __file__ = globals()["__file__"]
+        __path__ = [os.path.dirname(__file__)]
+
+        def _get_module(self, module_name: str):
+            return importlib.import_module("." + module_name, self.__name__)
+
+    sys.modules[__name__] = _LazyModule(__name__, _import_structure)
--- a/src/transformers/models/m2m_100/configuration_m2m_100.py
+++ b/src/transformers/models/m2m_100/configuration_m2m_100.py
@@ -0,0 +1,165 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+""" M2M100 model configuration """
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+M2M_100_PRETRAINED_CONFIG_ARCHIVE_MAP = {
+    "facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/config.json",
+    # See all M2M100 models at https://huggingface.co/models?filter=m2m_100
+}
+
+
+class M2M100Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a :class:`~transformers.M2M100Model`. It is used to
+    instantiate an M2M100 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the M2M100 `m2m100_418M
+    <https://huggingface.co/facebook/m2m100_418M>`__ architecture.
+
+    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
+    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.
+
+
+    Args:
+        vocab_size (:obj:`int`, `optional`, defaults to 50265):
+            Vocabulary size of the M2M100 model. Defines the number of different tokens that can be represented by the
+            :obj:`inputs_ids` passed when calling :class:`~transformers.M2M100Model` or
+        d_model (:obj:`int`, `optional`, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (:obj:`int`, `optional`, defaults to 12):
+            Number of decoder layers.
+        encoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (:obj:`int`, `optional`, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (:obj:`int`, `optional`, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (:obj:`str` or :obj:`function`, `optional`, defaults to :obj:`"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string,
+            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
+        dropout (:obj:`float`, `optional`, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        classifier_dropout (:obj:`float`, `optional`, defaults to 0.0):
+            The dropout ratio for classifier.
+        max_position_embeddings (:obj:`int`, `optional`, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (:obj:`float`, `optional`, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        encoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the `LayerDrop paper <see
+            https://arxiv.org/abs/1909.11556>`__ for more details.
+        decoder_layerdrop: (:obj:`float`, `optional`, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the `LayerDrop paper <see
+            https://arxiv.org/abs/1909.11556>`__ for more details.
+        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        gradient_checkpointing (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            If True, use gradient checkpointing to save memory at the expense of slower backward pass.
+
+        Example::
+
+            >>> from transformers import M2M100Model, M2M100Config
+
+            >>> # Initializing a M2M100 facebook/m2m100_418M style configuration
+            >>> configuration = M2M100Config()
+
+            >>> # Initializing a model from the facebook/m2m100_418M style configuration
+            >>> model = M2M100Model(configuration)
+
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
+    """
+    model_type = "m2m_100"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=128112,
+        max_position_embeddings=1024,
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.05,
+        decoder_layerdrop=0.05,
+        use_cache=True,
+        is_encoder_decoder=True,
+        activation_function="relu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.1,
+        activation_dropout=0.0,
+        init_std=0.02,
+        decoder_start_token_id=2,
+        scale_embedding=True,
+        gradient_checkpointing=False,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        **kwargs
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers
+        self.gradient_checkpointing = gradient_checkpointing
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        return self.d_model
--- a/src/transformers/models/m2m_100/convert_m2m100_original_checkpoint_to_pytorch.py
+++ b/src/transformers/models/m2m_100/convert_m2m100_original_checkpoint_to_pytorch.py
@@ -0,0 +1,85 @@
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+
+import torch
+from torch import nn
+
+from transformers import M2M100Config, M2M100ForConditionalGeneration
+
+
+def remove_ignore_keys_(state_dict):
+    ignore_keys = [
+        "encoder.version",
+        "decoder.version",
+        "model.encoder.version",
+        "model.decoder.version",
+        "decoder.output_projection.weight",
+        "_float_tensor",
+        "encoder.embed_positions._float_tensor",
+        "decoder.embed_positions._float_tensor",
+    ]
+    for k in ignore_keys:
+        state_dict.pop(k, None)
+
+
+def make_linear_from_emb(emb):
+    vocab_size, emb_size = emb.weight.shape
+    lin_layer = nn.Linear(vocab_size, emb_size, bias=False)
+    lin_layer.weight.data = emb.weight.data
+    return lin_layer
+
+
+def convert_fairseq_m2m100_checkpoint_from_disk(checkpoint_path):
+    m2m_100 = torch.load(checkpoint_path, map_location="cpu")
+    args = m2m_100["args"]
+    state_dict = m2m_100["model"]
+    remove_ignore_keys_(state_dict)
+    vocab_size = state_dict["encoder.embed_tokens.weight"].shape[0]
+
+    config = M2M100Config(
+        vocab_size=vocab_size,
+        max_position_embeddings=1024,
+        encoder_layers=args.encoder_layers,
+        decoder_layers=args.decoder_layers,
+        encoder_attention_heads=args.encoder_attention_heads,
+        decoder_attention_heads=args.decoder_attention_heads,
+        encoder_ffn_dim=args.encoder_ffn_embed_dim,
+        decoder_ffn_dim=args.decoder_ffn_embed_dim,
+        d_model=args.encoder_embed_dim,
+        encoder_layerdrop=args.encoder_layerdrop,
+        decoder_layerdrop=args.decoder_layerdrop,
+        dropout=args.dropout,
+        attention_dropout=args.attention_dropout,
+        activation_dropout=args.activation_dropout,
+        activation_function="relu",
+    )
+
+    state_dict["shared.weight"] = state_dict["decoder.embed_tokens.weight"]
+    model = M2M100ForConditionalGeneration(config)
+    model.model.load_state_dict(state_dict)
+    model.lm_head = make_linear_from_emb(model.model.shared)
+
+    return model
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # Required parameters
+    parser.add_argument("fairseq_path", type=str, help="path to a model.pt on local filesystem.")
+    parser.add_argument("pytorch_dump_folder_path", default=None, type=str, help="Path to the output PyTorch model.")
+    args = parser.parse_args()
+    model = convert_fairseq_m2m100_checkpoint_from_disk(args.fairseq_pathß)
+    model.save_pretrained(args.pytorch_dump_folder_path)
--- a/src/transformers/models/m2m_100/modeling_m2m_100.py
+++ b/src/transformers/models/m2m_100/modeling_m2m_100.py
--- a/src/transformers/models/m2m_100/tokenization_m2m_100.py
+++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py
@@ -0,0 +1,334 @@
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for M2M100."""
+import json
+from contextlib import contextmanager
+from pathlib import Path
+from shutil import copyfile
+from typing import Dict, List, Optional, Tuple, Union
+
+import sentencepiece
+
+from ...tokenization_utils import BatchEncoding, PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+SPIECE_UNDERLINE = "▁"
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "spm_file": "sentencepiece.bpe.model",
+    "tokenizer_config_file": "tokenizer_config.json",
+}
+
+PRETRAINED_VOCAB_FILES_MAP = {
+    "vocab_file": {
+        "facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/vocab.json",
+    },
+    "spm_file": {
+        "facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/sentencepiece.bpe.model",
+    },
+    "tokenizer_config_file": {
+        "facebook/m2m100_418M": "https://huggingface.co/facebook/m2m100_418M/resolve/main/tokenizer_config.json",
+    },
+}
+
+ALL_M2M100_MODELS = ["facebook/m2m100_418M", "facebook/m2m100_1.2B"]
+SPM_URL = "https://huggingface.co/facebook/m2m100_418M/resolve/main/sentence.bpe.model"
+
+# fmt: off
+FAIRSEQ_LANGUAGE_CODES = ["af", "am", "ar", "ast", "az", "ba", "be", "bg", "bn", "br", "bs", "ca", "ceb", "cs", "cy", "da", "de", "el", "en", "es", "et", "fa", "ff", "fi", "fr", "fy", "ga", "gd", "gl", "gu", "ha", "he", "hi", "hr", "ht", "hu", "hy", "id", "ig", "ilo", "is", "it", "ja", "jv", "ka", "kk", "km", "kn", "ko", "lb", "lg", "ln", "lo", "lt", "lv", "mg", "mk", "ml", "mn", "mr", "ms", "my", "ne", "nl", "no", "ns", "oc", "or", "pa", "pl", "ps", "pt", "ro", "ru", "sd", "si", "sk", "sl", "so", "sq", "sr", "ss", "su", "sv", "sw", "ta", "th", "tl", "tn", "tr", "uk", "ur", "uz", "vi", "wo", "xh", "yi", "yo", "zh", "zu"]
+# fmt: on
+
+
+class M2M100Tokenizer(PreTrainedTokenizer):
+    """
+    Construct an M2M100 tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            Path to the vocabulary file.
+        spm_file (:obj:`str`):
+            Path to `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a .spm extension)
+            that contains the vocabulary.
+        src_lang (:obj:`str`, `optional`):
+            A string representing the source language.
+        tgt_lang (:obj:`str`, `optional`):
+            A string representing the target language.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The end of sequence token.
+        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+
+    Examples::
+
+        >>> from transformers import M2M100Tokenizer
+        >>> tokenizer = M2M100Tokenizer.from_pretrained("facebook/m2m100_418M, src_lang="en", tgt_lang="ro")
+        >>> src_text = " UN Chief Says There Is No Military Solution in Syria"
+        >>> tgt_text =  "Şeful ONU declară că nu există o soluţie militară în Siria"
+        >>> model_inputs = tokenizer(src_text, return_tensors="pt")
+        >>> with tokenizer.as_target_tokenizer():
+        ...    labels = tokenizer(tgt_text, return_tensors="pt").input_ids
+        >>> # model(**model_inputs, labels=labels) should work
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    max_model_input_sizes = {m: 1024 for m in ALL_M2M100_MODELS}
+    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
+    model_input_names = ["input_ids", "attention_mask"]
+
+    prefix_tokens: List[int] = []
+    suffix_tokens: List[int] = []
+
+    def __init__(
+        self,
+        vocab_file,
+        spm_file,
+        src_lang=None,
+        tgt_lang=None,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        pad_token="<pad>",
+        unk_token="<unk>",
+        **kwargs,
+    ):
+        super().__init__(
+            src_lang=src_lang,
+            tgt_lang=tgt_lang,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
+
+        self.vocab_file = vocab_file
+        self.encoder = load_json(vocab_file)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.spm_file = spm_file
+        self.sp_model = load_spm(spm_file)
+
+        self.encoder_size = len(self.encoder)
+
+        self.lang_code_to_token = {lang_code: f"__{lang_code}__" for lang_code in FAIRSEQ_LANGUAGE_CODES}
+
+        self.lang_token_to_id = {
+            self.get_lang_token(lang_code): self.encoder_size + i for i, lang_code in enumerate(FAIRSEQ_LANGUAGE_CODES)
+        }
+        self.lang_code_to_id = {lang_code: self.encoder_size + i for i, lang_code in enumerate(FAIRSEQ_LANGUAGE_CODES)}
+        self.id_to_lang_token = {v: k for k, v in self.lang_token_to_id.items()}
+        self._additional_special_tokens = list(self.lang_token_to_id.keys())
+
+        self._src_lang = src_lang if src_lang is not None else "en"
+        self.tgt_lang = tgt_lang
+        self.cur_lang_id = self.get_lang_id(self._src_lang)
+        self.set_src_lang_special_tokens(self._src_lang)
+
+        self.num_madeup_words = 8
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.encoder) + len(self.lang_token_to_id) + self.num_madeup_words
+
+    @property
+    def src_lang(self) -> str:
+        return self._src_lang
+
+    @src_lang.setter
+    def src_lang(self, new_src_lang: str) -> None:
+        self._src_lang = new_src_lang
+        self.set_src_lang_special_tokens(self._src_lang)
+
+    def _tokenize(self, text: str) -> List[str]:
+        return self.sp_model.EncodeAsPieces(text)
+
+    def _convert_token_to_id(self, token):
+        if token in self.lang_token_to_id:
+            return self.lang_token_to_id[token]
+        return self.encoder.get(token, self.encoder[self.unk_token])
+
+    def _convert_id_to_token(self, index: int) -> str:
+        """Converts an index (integer) in a token (str) using the decoder."""
+        if index in self.id_to_lang_token:
+            return self.id_to_lang_token[index]
+        return self.decoder.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens: List[str]) -> str:
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+        return out_string
+
+    def get_special_tokens_mask(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
+    ) -> List[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer ``prepare_for_model`` method.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formatted with special tokens for the model."
+                )
+            return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
+        prefix_ones = [1] * len(self.prefix_tokens)
+        suffix_ones = [1] * len(self.suffix_tokens)
+        if token_ids_1 is None:
+            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
+        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
+    ) -> List[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An MBART sequence has the following format, where ``X`` represents the sequence:
+
+        - ``input_ids`` (for encoder) ``X [eos, src_lang_code]``
+        - ``decoder_input_ids``: (for decoder) ``X [eos, tgt_lang_code]``
+
+        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
+        separator.
+
+        Args:
+            token_ids_0 (:obj:`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (:obj:`List[int]`, `optional`):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            :obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
+        # We don't expect to process pairs, but leave the pair logic for API consistency
+        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
+
+    def get_vocab(self) -> Dict:
+        vocab = self.encoder.copy()
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def __getstate__(self) -> Dict:
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d: Dict) -> None:
+        self.__dict__ = d
+        self.sp_model = load_spm(self.spm_file)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        save_dir = Path(save_directory)
+        assert save_dir.is_dir(), f"{save_directory} should be a directory"
+        vocab_save_path = save_dir / (
+            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["vocab_file"]
+        )
+        spm_save_path = save_dir / (
+            (filename_prefix + "-" if filename_prefix else "") + self.vocab_files_names["spm_file"]
+        )
+
+        save_json(self.encoder, vocab_save_path)
+
+        if not spm_save_path.exists():
+            copyfile(self.spm_file, spm_save_path)
+
+        return (str(vocab_save_path), str(spm_save_path))
+
+    def prepare_seq2seq_batch(
+        self,
+        src_texts: List[str],
+        src_lang: str = "en",
+        tgt_texts: Optional[List[str]] = None,
+        tgt_lang: str = "ro",
+        **kwargs,
+    ) -> BatchEncoding:
+        self.src_lang = src_lang
+        self.tgt_lang = tgt_lang
+        self.set_src_lang_special_tokens(self.src_lang)
+        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
+
+    @contextmanager
+    def as_target_tokenizer(self):
+        """
+        Temporarily sets the tokenizer for encoding the targets. Useful for tokenizer associated to
+        sequence-to-sequence models that need a slightly different processing for the labels.
+        """
+        self.set_tgt_lang_special_tokens(self.tgt_lang)
+        yield
+        self.set_src_lang_special_tokens(self.src_lang)
+
+    def set_src_lang_special_tokens(self, src_lang: str) -> None:
+        """Reset the special tokens to the source lang setting. No prefix and suffix=[eos, src_lang_code]."""
+        lang_token = self.get_lang_token(src_lang)
+        self.cur_lang_id = self.lang_token_to_id[lang_token]
+        self.prefix_tokens = [self.cur_lang_id]
+        self.suffix_tokens = [self.eos_token_id]
+
+    def set_tgt_lang_special_tokens(self, tgt_lang: str) -> None:
+        """Reset the special tokens to the target language setting. No prefix and suffix=[eos, tgt_lang_code]."""
+        lang_token = self.get_lang_token(tgt_lang)
+        self.cur_lang_id = self.lang_token_to_id[lang_token]
+        self.prefix_tokens = [self.cur_lang_id]
+        self.suffix_tokens = [self.eos_token_id]
+
+    def get_lang_token(self, lang: str) -> str:
+        return self.lang_code_to_token[lang]
+
+    def get_lang_id(self, lang: str) -> int:
+        lang_token = self.get_lang_token(lang)
+        return self.lang_token_to_id[lang_token]
+
+
+def load_spm(path: str) -> sentencepiece.SentencePieceProcessor:
+    spm = sentencepiece.SentencePieceProcessor()
+    spm.Load(str(path))
+    return spm
+
+
+def load_json(path: str) -> Union[Dict, List]:
+    with open(path, "r") as f:
+        return json.load(f)
+
+
+def save_json(data, path: str) -> None:
+    with open(path, "w") as f:
+        json.dump(data, f, indent=2)