[Pegasus] Refactor Tokenizer (#8731)

* refactor * further refactor * fix the rest tomorrow * save intermediate * finish slow tokenizer * make more tests pass * finish refactor * fix comment * clean further * fix name * fix naming * Update src/transformers/models/reformer/tokenization_reformer.py * Apply suggestions from code review * Apply suggestions from code review * refactor * fix init tokenizers * refactor * improve convert * refactor * correct convert slow tokenizer * final fix for Pegasus Tok * remove ipdb * improve links
2020-11-29 16:57:43 +01:00
parent 36b60ce9e8
commit 5ced23dc84
11 changed files with 327 additions and 67 deletions
--- a/src/transformers/convert_slow_tokenizer.py
+++ b/src/transformers/convert_slow_tokenizer.py
@@ -547,10 +547,12 @@ class BertGenerationConverter(SpmConverter):
 class PegasusConverter(SpmConverter):
    def vocab(self, proto):
        vocab = [
-            (self.original_tokenizer.pad_token, 0),
-            (self.original_tokenizer.eos_token, 0),
+            (self.original_tokenizer.pad_token, 0.0),
+            (self.original_tokenizer.eos_token, 0.0),
+            (self.original_tokenizer.mask_token_sent, 0.0),
+            (self.original_tokenizer.mask_token, 0.0),
        ]
-        vocab += [(f"unk_{i}", -100) for i in range(2, 2 + self.original_tokenizer.offset)]
+        vocab += [(f"<unk_{i}>", -100.0) for i in range(2, self.original_tokenizer.offset)]
        vocab += [(piece.piece, piece.score) for piece in proto.pieces[2:]]
        return vocab

@@ -559,13 +561,10 @@ class PegasusConverter(SpmConverter):

    def post_processor(self):
        eos = self.original_tokenizer.eos_token
-        return processors.TemplateProcessing(
-            single=["$A", eos],
-            pair=["$A", "$B", eos],
        special_tokens = [
            (eos, self.original_tokenizer.eos_token_id),
-            ],
-        )
+        ]
+        return processors.TemplateProcessing(single=["$A", eos], pair=["$A", "$B", eos], special_tokens=special_tokens)


 class T5Converter(SpmConverter):
--- a/src/transformers/models/albert/tokenization_albert_fast.py
+++ b/src/transformers/models/albert/tokenization_albert_fast.py
@@ -71,10 +71,10 @@ SPIECE_UNDERLINE = "▁"

 class AlbertTokenizerFast(PreTrainedTokenizerFast):
    """
-    Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__. This tokenizer inherits from
-    :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should refer to this
-    superclass for more information regarding those methods
+    Construct a "fast" ALBERT tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__. This tokenizer
+    inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods

    Args:
        vocab_file (:obj:`str`):
--- a/src/transformers/models/camembert/tokenization_camembert_fast.py
+++ b/src/transformers/models/camembert/tokenization_camembert_fast.py
@@ -60,8 +60,8 @@ SPIECE_UNDERLINE = "▁"
 class CamembertTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" CamemBERT tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
-    :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__.
+    :class:`~transformers.RobertaTokenizer` and :class:`~transformers.XLNetTokenizer`. Based on `BPE
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.
--- a/src/transformers/models/mbart/tokenization_mbart_fast.py
+++ b/src/transformers/models/mbart/tokenization_mbart_fast.py
@@ -67,7 +67,8 @@ FAIRSEQ_LANGUAGE_CODES = [

 class MBartTokenizerFast(XLMRobertaTokenizerFast):
    """
-    Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library).
+    Construct a "fast" MBART tokenizer (backed by HuggingFace's `tokenizers` library). Based on `BPE
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.

    :class:`~transformers.MBartTokenizerFast` is a subclass of :class:`~transformers.XLMRobertaTokenizerFast` and adds
    a new :meth:`~transformers.MBartTokenizerFast.prepare_seq2seq_batch`.
--- a/src/transformers/models/pegasus/tokenization_pegasus.py
+++ b/src/transformers/models/pegasus/tokenization_pegasus.py
@@ -12,11 +12,16 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Dict, List, Optional
+import os
+from shutil import copyfile
+from typing import Dict, List, Optional, Tuple
+
+import sentencepiece as spm

 from ...file_utils import add_start_docstrings
+from ...tokenization_utils import PreTrainedTokenizer
 from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
-from ..reformer.tokenization_reformer import ReformerTokenizer
+from ...utils import logging


 SPIECE_UNDERLINE = "▁"
@@ -32,31 +37,145 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 }


-class PegasusTokenizer(ReformerTokenizer):
+logger = logging.get_logger(__name__)
+
+
+class PegasusTokenizer(PreTrainedTokenizer):
    r"""
-    Construct a Pegasus tokenizer.
+    Construct a PEGASUS tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.

-    :class:`~transformers.PegasusTokenizer` is identical to :class:`~transformers.ReformerTokenizer` and adds a new
-    :meth:`~transformers.PegasusTokenizer.prepare_seq2seq_batch`
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    Users should refer to this superclass for more information regarding those methods.

-    Refer to superclass :class:`~transformers.ReformerTokenizer` for usage examples and documentation concerning the
-    initialization parameters and other methods.
+    Args:
+        vocab_file (:obj:`str`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end of
+                sequence. The token used is the :obj:`sep_token`.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask_2>"`):
+            The token used for masking single token values. This is the token used when training this model with masked
+            language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
+            It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
+            Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
+        mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`"<mask_1>"`):
+            The token used for masking whole target sentences. This is the token used when training this model with gap
+            sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
+            pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for
+            Abstractive Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
+        additional_special_tokens (:obj:`List[str]`, `optional`):
+            Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_2> and
+            <unk_2, ..., unk_102> are used as additional special tokens corresponding to the `original PEGASUS
+            tokenizer
+            <https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66>`__
+            that uses the tokens 2 - 104 only for pretraining
    """
+    vocab_files_names = VOCAB_FILES_NAMES
+
    offset = 103  # entries 2 - 104 are only used for pretraining
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
+    model_input_names = ["attention_mask"]

-    def __init__(self, *args, pad_token="<pad>", **kwargs):
-        super().__init__(*args, **kwargs, pad_token="<pad>")
-        # Don't use reserved words added_token_encoder, added_tokens_decoder because of
-        # AssertionError: Non-consecutive added token '1' found. in from_pretrained
-        assert len(self.added_tokens_decoder) == 0
-        self.encoder: Dict[int, str] = {0: self.pad_token, 1: self.eos_token}
-        # entries 2-104 are only used for pretraining and called unk_2, ...unk_104
-        self.encoder.update({i: f"unk_{i}" for i in range(2, self.offset + 2)})
+    def __init__(
+        self,
+        vocab_file,
+        pad_token="<pad>",
+        eos_token="</s>",
+        unk_token="<unk>",
+        mask_token="<mask_2>",
+        mask_token_sent="<mask_1>",
+        additional_special_tokens=None,
+        **kwargs
+    ):
+        if additional_special_tokens is not None:
+            assert isinstance(
+                additional_special_tokens, list
+            ), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}"
+
+            additional_special_tokens_extended = (
+                ([mask_token_sent] + additional_special_tokens)
+                if mask_token_sent not in additional_special_tokens
+                else additional_special_tokens
+            )
+            # fill additional tokens with ..., <unk_token_102> in case not all additional tokens are already taken
+            additional_special_tokens_extended += [
+                f"<unk_{i}>" for i in range(len(additional_special_tokens_extended), self.offset - 1)
+            ]
+
+            if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended):
+                raise ValueError(
+                    f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of <unk_x> tokens. Found {additional_special_tokens_extended}."
+                )
+            additional_special_tokens = additional_special_tokens_extended
+        else:
+            additional_special_tokens = [mask_token_sent]
+            additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]
+
+        super().__init__(
+            eos_token=eos_token,
+            unk_token=unk_token,
+            mask_token=mask_token,
+            pad_token=pad_token,
+            mask_token_sent=mask_token_sent,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+        self.vocab_file = vocab_file
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(vocab_file)
+        self.mask_token_sent = mask_token_sent
+
+        # add special tokens to encoder dict
+        self.encoder: Dict[int, str] = {
+            0: self.pad_token,
+            1: self.eos_token,
+            2: self.mask_token_sent,
+            3: self.mask_token,
+        }
+        # entries 2-104 are only used for pretraining and called <mask_1>, <mask_2>, unk_2, ...unk_102
+        # mask_token_sent is already added to list -> so start at 1
+        self.encoder.update({i + 3: additional_special_tokens[i] for i in range(1, self.offset - 1)})
        self.decoder: Dict[str, int] = {v: k for k, v in self.encoder.items()}

+    @property
+    def vocab_size(self) -> int:
+        return len(self.sp_model) + self.offset
+
+    def get_vocab(self) -> Dict[str, int]:
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model.Load(self.vocab_file)
+
+    def _tokenize(self, text, sample=False):
+        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
+        if not sample:
+            pieces = self.sp_model.EncodeAsPieces(text)
+        else:
+            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
+        return pieces
+
    def _convert_token_to_id(self, token: str) -> int:
        """ Converts a token (str) to an id using the vocab. """
        if token in self.decoder:
@@ -73,13 +192,13 @@ class PegasusTokenizer(ReformerTokenizer):
        elif index in self.added_tokens_encoder:
            return self.added_tokens_encoder[index]
        else:
-            # assert index > self.offset, f"cannot decode ids between 2 and {self.offset}. Got {index}"
            token = self.sp_model.IdToPiece(index - self.offset)
        return token

-    @property
-    def vocab_size(self) -> int:
-        return len(self.sp_model) + self.offset
+    def convert_tokens_to_string(self, tokens):
+        """ Converts a sequence of tokens (string) in a single string. """
+        out_string = self.sp_model.decode_pieces(tokens)
+        return out_string

    def num_special_tokens_to_add(self, pair=False):
        """Just EOS"""
@@ -88,7 +207,11 @@ class PegasusTokenizer(ReformerTokenizer):
    def _special_token_mask(self, seq):
        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
-        assert all_special_ids == set([0, 1])
+
+        assert all_special_ids == set(
+            range(len(self.additional_special_tokens) + 3)
+        ), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}"
+
        return [1 if x in all_special_ids else 0 for x in seq]

    def get_special_tokens_mask(
@@ -105,7 +228,7 @@ class PegasusTokenizer(ReformerTokenizer):
    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
-        and adding special tokens. A Pegasus sequence has the following format, where ``X`` represents the sequence:
+        and adding special tokens. A PEGASUS sequence has the following format, where ``X`` represents the sequence:

        - single sequence: ``X </s>``
        - pair of sequences: ``A B </s>`` (not intended use)
@@ -156,3 +279,16 @@ class PegasusTokenizer(ReformerTokenizer):
        labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
        model_inputs["labels"] = labels
        return model_inputs
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
--- a/src/transformers/models/pegasus/tokenization_pegasus_fast.py
+++ b/src/transformers/models/pegasus/tokenization_pegasus_fast.py
@@ -12,11 +12,17 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import List, Optional
+""" Tokenization class for model PEGASUS."""
+
+
+import os
+from shutil import copyfile
+from typing import List, Optional, Tuple

 from ...file_utils import add_start_docstrings, is_sentencepiece_available
 from ...tokenization_utils_base import PREPARE_SEQ2SEQ_BATCH_DOCSTRING, BatchEncoding
-from ..reformer.tokenization_reformer_fast import ReformerTokenizerFast
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import logging


 if is_sentencepiece_available():
@@ -25,6 +31,9 @@ else:
    PegasusTokenizer = None


+logger = logging.get_logger(__name__)
+
+
 SPIECE_UNDERLINE = "▁"

 VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
@@ -39,21 +48,112 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 }


-class PegasusTokenizerFast(ReformerTokenizerFast):
+class PegasusTokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.
+
+    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
+    Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (:obj:`str`):
+            `SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
+            The end of sequence token.
+
+            .. note::
+
+                When building a sequence using special tokens, this is not the token that is used for the end of
+                sequence. The token used is the :obj:`sep_token`.
+        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask_2>"`):
+            The token used for masking single token values. This is the token used when training this model with masked
+            language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
+            It corresponds to `[MASK2]` in `PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
+            Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
+        mask_token_sent (:obj:`str`, `optional`, defaults to :obj:`"<mask_1>"`):
+            The token used for masking whole target sentences. This is the token used when training this model with gap
+            sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
+            pretraining. It corresponds to `[MASK1]` in `PEGASUS: Pre-training with Extracted Gap-sentences for
+            Abstractive Summarization <https://arxiv.org/pdf/1912.08777.pdf>`__.
+        additional_special_tokens (:obj:`List[str]`, `optional`):
+            Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_2> and
+            <unk_2, ..., unk_102> are used as additional special tokens corresponding to the `original PEGASUS
+            tokenizer
+            <https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66>`__
+            that uses the tokens 2 - 104 only for pretraining
+    """
    offset = 103  # entries 2-104 are only used for pretraining
    vocab_files_names = VOCAB_FILES_NAMES
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    slow_tokenizer_class = PegasusTokenizer
+    model_input_names = ["attention_mask"]

-    # def num_special_tokens_to_add(self, pair=False):
-    #     """Just EOS"""
-    #     return 1
+    def __init__(
+        self,
+        vocab_file,
+        tokenizer_file=None,
+        pad_token="<pad>",
+        eos_token="</s>",
+        unk_token="<unk>",
+        mask_token="<mask_2>",
+        mask_token_sent="<mask_1>",
+        additional_special_tokens=None,
+        **kwargs
+    ):
+        if additional_special_tokens is not None:
+            assert isinstance(
+                additional_special_tokens, list
+            ), f"additional_special_tokens should be of type {type(list)}, but is {type(additional_special_tokens)}"
+
+            additional_special_tokens_extended = (
+                ([mask_token_sent] + additional_special_tokens)
+                if mask_token_sent not in additional_special_tokens
+                else additional_special_tokens
+            )
+            # fill additional tokens with ..., <unk_token_102> in case not all additional tokens are already taken
+            additional_special_tokens_extended += [
+                f"<unk_{i}>" for i in range(len(additional_special_tokens_extended), self.offset - 1)
+            ]
+
+            if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended):
+                raise ValueError(
+                    f"Please make sure that the provided additional_special_tokens do not contain an incorrectly shifted list of <unk_x> tokens. Found {additional_special_tokens_extended}."
+                )
+            additional_special_tokens = additional_special_tokens_extended
+        else:
+            additional_special_tokens = [mask_token_sent]
+            additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]
+
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            pad_token=pad_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            mask_token=mask_token,
+            mask_token_sent=mask_token_sent,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+
+        self.vocab_file = vocab_file

    def _special_token_mask(self, seq):
        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
-        assert all_special_ids == set([0, 1])
+
+        assert all_special_ids == set(
+            range(len(self.additional_special_tokens) + 3)
+        ), f"There should be 3 special tokens: mask_token, pad_token, and eos_token + {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}"
+
        return [1 if x in all_special_ids else 0 for x in seq]

    def get_special_tokens_mask(
@@ -117,3 +217,16 @@ class PegasusTokenizerFast(ReformerTokenizerFast):
        labels: BatchEncoding = self(tgt_texts, **tokenizer_kwargs)["input_ids"]
        model_inputs["labels"] = labels
        return model_inputs
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
--- a/src/transformers/models/reformer/tokenization_reformer_fast.py
+++ b/src/transformers/models/reformer/tokenization_reformer_fast.py
@@ -64,8 +64,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {

 class ReformerTokenizerFast(PreTrainedTokenizerFast):
    """
-    Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__ .
+    Construct a "fast" Reformer tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.
--- a/src/transformers/models/t5/tokenization_t5_fast.py
+++ b/src/transformers/models/t5/tokenization_t5_fast.py
@@ -75,8 +75,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {

 class T5TokenizerFast(PreTrainedTokenizerFast):
    """
-    Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__ .
+    Construct a "fast" T5 tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.
--- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
+++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py
@@ -66,8 +66,8 @@ PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
 class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
    """
    Construct a "fast" XLM-RoBERTa tokenizer (backed by HuggingFace's `tokenizers` library). Adapted from
-    :class:`~transfomers.RobertaTokenizer` and class:`~transfomers.XLNetTokenizer`. Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__.
+    :class:`~transfomers.RobertaTokenizer` and class:`~transfomers.XLNetTokenizer`. Based on `BPE
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models>`__.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.
--- a/src/transformers/models/xlnet/tokenization_xlnet_fast.py
+++ b/src/transformers/models/xlnet/tokenization_xlnet_fast.py
@@ -62,8 +62,8 @@ SEG_ID_PAD = 4

 class XLNetTokenizerFast(PreTrainedTokenizerFast):
    """
-    Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on `SentencePiece
-    <https://github.com/google/sentencepiece>`__.
+    Construct a "fast" XLNet tokenizer (backed by HuggingFace's `tokenizers` library). Based on `Unigram
+    <https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models>`__.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the main
    methods. Users should refer to this superclass for more information regarding those methods.
--- a/tests/test_tokenization_pegasus.py
+++ b/tests/test_tokenization_pegasus.py
@@ -26,21 +26,34 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        tokenizer.save_pretrained(self.tmpdirname)

    @cached_property
-    def pegasus_large_tokenizer(self):
+    def _large_tokenizer(self):
        return PegasusTokenizer.from_pretrained("google/pegasus-large")

-    @unittest.skip("add_tokens does not work yet")
-    def test_swap_special_token(self):
-        pass
-
    def get_tokenizer(self, **kwargs) -> PegasusTokenizer:
        return PegasusTokenizer.from_pretrained(self.tmpdirname, **kwargs)

    def get_input_output_texts(self, tokenizer):
        return ("This is a test", "This is a test")

-    def test_pegasus_large_tokenizer_settings(self):
-        tokenizer = self.pegasus_large_tokenizer
+    def test_mask_tokens_rust_pegasus(self):
+        rust_tokenizer = self.rust_tokenizer_class.from_pretrained(self.tmpdirname)
+        py_tokenizer = self.tokenizer_class.from_pretrained(self.tmpdirname)
+        raw_input_str = "Let's see which <unk> is the better <unk_token_11> one <mask_1> It seems like this <mask_2> was important </s> <pad> <pad> <pad>"
+        rust_ids = rust_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
+        py_ids = py_tokenizer([raw_input_str], return_tensors=None, add_special_tokens=False).input_ids[0]
+        # TODO: (Thom, Patrick) - this fails because the rust tokenizer does not know about the <mask_1>, <mask_2>, and those <unk_token_x> yet
+        self.assertListEqual(py_ids, rust_ids)
+
+    def test_large_mask_tokens(self):
+        tokenizer = self._large_tokenizer
+        # <mask_1> masks whole sentence while <mask_2> masks single word
+        raw_input_str = "<mask_1> To ensure a <mask_2> flow of bank resolutions."
+        desired_result = [2, 413, 615, 114, 3, 1971, 113, 1679, 10710, 107, 1]
+        ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0]
+        self.assertListEqual(desired_result, ids)
+
+    def test_large_tokenizer_settings(self):
+        tokenizer = self._large_tokenizer
        # The tracebacks for the following asserts are **better** without messages or self.assertEqual
        assert tokenizer.vocab_size == 96103
        assert tokenizer.pad_token_id == 0
@@ -48,20 +61,18 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
        assert tokenizer.offset == 103
        assert tokenizer.unk_token_id == tokenizer.offset + 2 == 105
        assert tokenizer.unk_token == "<unk>"
-        assert tokenizer.mask_token is None
-        assert tokenizer.mask_token_id is None
        assert tokenizer.model_max_length == 1024
        raw_input_str = "To ensure a smooth flow of bank resolutions."
        desired_result = [413, 615, 114, 2291, 1971, 113, 1679, 10710, 107, 1]
        ids = tokenizer([raw_input_str], return_tensors=None).input_ids[0]
        self.assertListEqual(desired_result, ids)
-        assert tokenizer.convert_ids_to_tokens([0, 1, 2]) == ["<pad>", "</s>", "unk_2"]
+        assert tokenizer.convert_ids_to_tokens([0, 1, 2, 3]) == ["<pad>", "</s>", "<mask_1>", "<mask_2>"]

    @require_torch
-    def test_pegasus_large_seq2seq_truncation(self):
+    def test_large_seq2seq_truncation(self):
        src_texts = ["This is going to be way too long." * 150, "short example"]
        tgt_texts = ["not super long but more than 5 tokens", "tiny"]
-        batch = self.pegasus_large_tokenizer.prepare_seq2seq_batch(
+        batch = self._large_tokenizer.prepare_seq2seq_batch(
            src_texts, tgt_texts=tgt_texts, max_target_length=5, return_tensors="pt"
        )
        assert batch.input_ids.shape == (2, 1024)