From 37ed3ab719f10dc00bf63ac343b441bf78bb1eee Mon Sep 17 00:00:00 2001 From: Philip May Date: Thu, 13 May 2021 08:44:55 +0200 Subject: [PATCH] Enable option for subword regularization in more tokenizers. (#11417) * improve slow class tok usage at xlm rob * add subword regularization for barthez * improve barthez tok. test * fix tokenizer tests * add subword regularization for camembert * add subword regularization for deberta v2 tokenizer * add more doc to deberta v2 tokenizer * add subword regularization for speech to text tok. * fix sp_model_kwargs type in speech 2 text tok. * add subword regularization for M2M100 tok. * add more concrete type hints * fix tests for m2m100 and s2t tok. * add missing Any import * fix syntax error in m2m100 tok. * fix unpickle of m2m100 and s2t tok. * fix test of m2m100 and s2t tok. * improve unpickle of deberta v2 tok. * add test for pickle of barthez & camembert * fix pickle of barthez & camembert * add test for deberta v2 tok. pickle * fix m2m100 tok. pickle * fix s2t tok. pickle * add subword regularization to albert tok. * refactor subword reg. test into TokenizerTesterMixin improve albert tok. test remove sample argument from albert tok. check subword reg. using TokenizerTesterMixin improve tok. tests improve xlm roberta tok. tests improve xlm roberta tok. tests * add subword regularization for big bird t. * improve xlm roberta tok. test * add subword regularization for mbart50 tok. * add subword regularization for pegasus tok. * add subword regularization for reformer tok. * add subword regularization for T5 tok. * fix t5 tok. test formatting * add subword regularization for xlm_proph. tok. * add subword regularization for xlnet tok. * add subword regularization for bert_gen tok. * add typing to tokenizers * add typing to xlm rob. tok * add subword regularization for marian tok. * add reverse tok. test * fix marian tok test * fix marian tok test * fix casing in tok. tests * fix style of tok. 
common test * fix deberta v2 tok test * add type annotations to tok. tests * add type annotations to tok. __init__ * add typing to tokenizer * add type annotations to tok. __init__ * don't specify the default when it's None * fix barthez tok. doc * move sentencepiece tok. tests to TokenizerTesterMixin * fix unused imports * fix albert tok. test * add comment to sentencepiece test options * fix Any import at big bird tok. * fix Any import at xlm prophetnet tok. * empty commit to trigger CI --- .../models/albert/tokenization_albert.py | 39 ++++++--- .../models/barthez/tokenization_barthez.py | 35 ++++++-- .../tokenization_bert_generation.py | 39 ++++++--- .../models/big_bird/tokenization_big_bird.py | 38 ++++++--- .../camembert/tokenization_camembert.py | 35 ++++++-- .../deberta_v2/tokenization_deberta_v2.py | 65 ++++++++++++--- .../models/m2m_100/tokenization_m2m_100.py | 37 +++++++-- .../models/marian/tokenization_marian.py | 39 +++++++-- .../models/mbart/tokenization_mbart50.py | 33 ++++++-- .../models/pegasus/tokenization_pegasus.py | 40 +++++++--- .../models/reformer/tokenization_reformer.py | 46 ++++++++--- .../tokenization_speech_to_text.py | 38 +++++++-- src/transformers/models/t5/tokenization_t5.py | 39 ++++++--- .../tokenization_xlm_prophetnet.py | 35 ++++++-- .../xlm_roberta/tokenization_xlm_roberta.py | 10 +-- .../models/xlnet/tokenization_xlnet.py | 39 ++++++--- tests/test_tokenization_albert.py | 3 +- tests/test_tokenization_barthez.py | 4 +- tests/test_tokenization_bert_generation.py | 2 +- tests/test_tokenization_big_bird.py | 4 +- tests/test_tokenization_camembert.py | 2 +- tests/test_tokenization_common.py | 80 +++++++++++++++++++ tests/test_tokenization_deberta_v2.py | 3 +- tests/test_tokenization_m2m_100.py | 1 + tests/test_tokenization_marian.py | 2 +- tests/test_tokenization_mbart50.py | 1 + tests/test_tokenization_pegasus.py | 2 + tests/test_tokenization_reformer.py | 2 +- tests/test_tokenization_speech_to_text.py | 1 + 
tests/test_tokenization_t5.py | 2 +- tests/test_tokenization_xlm_prophetnet.py | 2 +- tests/test_tokenization_xlm_roberta.py | 39 +-------- tests/test_tokenization_xlnet.py | 2 +- 33 files changed, 578 insertions(+), 181 deletions(-) diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py index 493a5e145a..720c1d0847 100644 --- a/src/transformers/models/albert/tokenization_albert.py +++ b/src/transformers/models/albert/tokenization_albert.py @@ -18,7 +18,7 @@ import os import unicodedata from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -102,6 +102,20 @@ class AlbertTokenizer(PreTrainedTokenizer): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -125,11 +139,14 @@ class AlbertTokenizer(PreTrainedTokenizer): pad_token="", cls_token="[CLS]", mask_token="[MASK]", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( do_lower_case=do_lower_case, remove_space=remove_space, @@ -141,6 +158,7 @@ class AlbertTokenizer(PreTrainedTokenizer): pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) @@ -149,7 +167,7 @@ class AlbertTokenizer(PreTrainedTokenizer): self.keep_accents = keep_accents self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @property @@ -168,7 +186,12 @@ class AlbertTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def preprocess_text(self, inputs): @@ -186,14 +209,10 @@ class AlbertTokenizer(PreTrainedTokenizer): return outputs - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Tokenize a string.""" text = self.preprocess_text(text) - - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) + pieces = self.sp_model.encode(text, out_type=str) new_pieces = [] for piece in pieces: if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): diff --git 
a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index 95d64cfa28..36bdbd7449 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -17,7 +17,7 @@ import os from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -89,6 +89,20 @@ class BarthezTokenizer(PreTrainedTokenizer): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -110,11 +124,14 @@ class BarthezTokenizer(PreTrainedTokenizer): unk_token="", pad_token="", mask_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -123,11 +140,12 @@ class BarthezTokenizer(PreTrainedTokenizer): cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} @@ -219,8 +237,8 @@ class BarthezTokenizer(PreTrainedTokenizer): vocab.update(self.added_tokens_encoder) return vocab - def _tokenize(self, text): - return self.sp_model.EncodeAsPieces(text) + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" @@ -243,7 +261,12 @@ class BarthezTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def convert_tokens_to_string(self, tokens): diff --git a/src/transformers/models/bert_generation/tokenization_bert_generation.py b/src/transformers/models/bert_generation/tokenization_bert_generation.py index 795d5f504c..43676e2801 100644 --- a/src/transformers/models/bert_generation/tokenization_bert_generation.py +++ b/src/transformers/models/bert_generation/tokenization_bert_generation.py @@ -17,7 +17,7 @@ import os from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple 
import sentencepiece as spm @@ -58,6 +58,20 @@ class BertGenerationTokenizer(PreTrainedTokenizer): token instead. pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): The token used for padding, for example when batching sequences of different lengths. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. """ vocab_files_names = VOCAB_FILES_NAMES @@ -74,8 +88,11 @@ class BertGenerationTokenizer(PreTrainedTokenizer): unk_token="", pad_token="", sep_token="<::::>", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + # Add extra_ids to the special token list super().__init__( bos_token=bos_token, @@ -83,12 +100,13 @@ class BertGenerationTokenizer(PreTrainedTokenizer): unk_token=unk_token, pad_token=pad_token, sep_token=sep_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @property @@ -107,16 +125,17 @@ class BertGenerationTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, 
"sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - return pieces + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py index e3e5a93f6d..92f652448d 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird.py +++ b/src/transformers/models/big_bird/tokenization_big_bird.py @@ -17,7 +17,7 @@ import os from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -74,7 +74,20 @@ class BigBirdTokenizer(PreTrainedTokenizer): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. 
+ - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. """ vocab_files_names = VOCAB_FILES_NAMES @@ -93,8 +106,9 @@ class BigBirdTokenizer(PreTrainedTokenizer): sep_token="[SEP]", mask_token="[MASK]", cls_token="[CLS]", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token @@ -105,6 +119,8 @@ class BigBirdTokenizer(PreTrainedTokenizer): # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -113,12 +129,13 @@ class BigBirdTokenizer(PreTrainedTokenizer): sep_token=sep_token, mask_token=mask_token, cls_token=cls_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @property @@ -137,16 +154,17 @@ class BigBirdTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) - def 
_tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - return pieces + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index b7bee4e19c..ff865c6acd 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -17,7 +17,7 @@ import os from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -85,6 +85,20 @@ class CamembertTokenizer(PreTrainedTokenizer): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -107,11 +121,14 @@ class CamembertTokenizer(PreTrainedTokenizer): pad_token="", mask_token="", additional_special_tokens=["NOTUSED", "NOTUSED"], + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -121,9 +138,10 @@ class CamembertTokenizer(PreTrainedTokenizer): pad_token=pad_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual @@ -218,8 +236,8 @@ class CamembertTokenizer(PreTrainedTokenizer): vocab.update(self.added_tokens_encoder) return vocab - def _tokenize(self, text): - return self.sp_model.EncodeAsPieces(text) + def _tokenize(self, text: str) -> List[str]: + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" @@ -243,7 +261,12 @@ class CamembertTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def convert_tokens_to_string(self, tokens): diff --git a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py 
b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py index ddb77c621b..66c97d4fe8 100644 --- a/src/transformers/models/deberta_v2/tokenization_deberta_v2.py +++ b/src/transformers/models/deberta_v2/tokenization_deberta_v2.py @@ -16,7 +16,7 @@ import os import unicodedata -from typing import Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as sp import six @@ -75,6 +75,20 @@ class DebertaV2Tokenizer(PreTrainedTokenizer): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
""" vocab_files_names = VOCAB_FILES_NAMES @@ -92,8 +106,11 @@ class DebertaV2Tokenizer(PreTrainedTokenizer): pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( do_lower_case=do_lower_case, unk_token=unk_token, @@ -102,6 +119,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer): cls_token=cls_token, mask_token=mask_token, split_by_punct=split_by_punct, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) @@ -112,7 +130,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer): ) self.do_lower_case = do_lower_case self.split_by_punct = split_by_punct - self._tokenizer = SPMTokenizer(vocab_file, split_by_punct=split_by_punct) + self._tokenizer = SPMTokenizer(vocab_file, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs) @property def vocab_size(self): @@ -127,7 +145,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer): vocab.update(self.get_added_vocab()) return vocab - def _tokenize(self, text): + def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" if self.do_lower_case: text = text.lower() @@ -234,10 +252,34 @@ class DebertaV2Tokenizer(PreTrainedTokenizer): class SPMTokenizer: - def __init__(self, vocab_file, split_by_punct=False): + r""" + Constructs a tokenizer based on `SentencePiece `__. + + Args: + vocab_file (:obj:`str`): + `SentencePiece `__ file (generally has a `.spm` extension) that + contains the vocabulary necessary to instantiate a tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. 
+ + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. + """ + + def __init__(self, vocab_file, split_by_punct=False, sp_model_kwargs: Optional[Dict[str, Any]] = None): self.split_by_punct = split_by_punct self.vocab_file = vocab_file - spm = sp.SentencePieceProcessor() + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) assert os.path.exists(vocab_file) spm.load(vocab_file) bpe_vocab_size = spm.GetPieceSize() @@ -261,7 +303,12 @@ class SPMTokenizer: def __setstate__(self, d): self.__dict__ = d - self.spm = sp.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs) self.spm.Load(self.vocab_file) def tokenize(self, text): @@ -344,10 +391,10 @@ class SPMTokenizer: text = convert_to_unicode(text) if self.split_by_punct: words = self._run_split_on_punc(text) - pieces = [self.spm.encode_as_pieces(w) for w in words] + pieces = [self.spm.encode(w, out_type=str) for w in words] return [p for w in pieces for p in w] else: - return self.spm.encode_as_pieces(text) + return self.spm.encode(text, out_type=str) def split_to_words(self, text): pieces = self._encode_as_pieces(text) diff --git a/src/transformers/models/m2m_100/tokenization_m2m_100.py b/src/transformers/models/m2m_100/tokenization_m2m_100.py index e39fbbd7aa..93663cd4a6 100644 --- a/src/transformers/models/m2m_100/tokenization_m2m_100.py +++ b/src/transformers/models/m2m_100/tokenization_m2m_100.py @@ -16,7 +16,7 @@ import json from contextlib import 
contextmanager from pathlib import Path from shutil import copyfile -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import sentencepiece @@ -86,6 +86,20 @@ class M2M100Tokenizer(PreTrainedTokenizer): token instead. pad_token (:obj:`str`, `optional`, defaults to :obj:`""`): The token used for padding, for example when batching sequences of different lengths. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
Examples:: @@ -118,8 +132,11 @@ class M2M100Tokenizer(PreTrainedTokenizer): sep_token="", pad_token="", unk_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs, - ): + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( src_lang=src_lang, tgt_lang=tgt_lang, @@ -128,6 +145,7 @@ class M2M100Tokenizer(PreTrainedTokenizer): sep_token=sep_token, unk_token=unk_token, pad_token=pad_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) @@ -135,7 +153,7 @@ class M2M100Tokenizer(PreTrainedTokenizer): self.encoder = load_json(vocab_file) self.decoder = {v: k for k, v in self.encoder.items()} self.spm_file = spm_file - self.sp_model = load_spm(spm_file) + self.sp_model = load_spm(spm_file, self.sp_model_kwargs) self.encoder_size = len(self.encoder) @@ -169,7 +187,7 @@ class M2M100Tokenizer(PreTrainedTokenizer): self.set_src_lang_special_tokens(self._src_lang) def _tokenize(self, text: str) -> List[str]: - return self.sp_model.EncodeAsPieces(text) + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): if token in self.lang_token_to_id: @@ -256,7 +274,12 @@ class M2M100Tokenizer(PreTrainedTokenizer): def __setstate__(self, d: Dict) -> None: self.__dict__ = d - self.sp_model = load_spm(self.spm_file) + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs) def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: save_dir = Path(save_directory) @@ -330,8 +353,8 @@ class M2M100Tokenizer(PreTrainedTokenizer): return self.lang_token_to_id[lang_token] -def load_spm(path: str) -> sentencepiece.SentencePieceProcessor: - spm = sentencepiece.SentencePieceProcessor() +def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor: + spm = 
sentencepiece.SentencePieceProcessor(**sp_model_kwargs) spm.Load(str(path)) return spm diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index 13453f0b58..828afd53b9 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -18,7 +18,7 @@ import warnings from contextlib import contextmanager from pathlib import Path from shutil import copyfile -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import sentencepiece @@ -82,6 +82,20 @@ class MarianTokenizer(PreTrainedTokenizer): The maximum sentence length the model accepts. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. Examples:: @@ -115,8 +129,11 @@ class MarianTokenizer(PreTrainedTokenizer): eos_token="", pad_token="", model_max_length=512, + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( # bos_token=bos_token, unused. 
Start decoding with config.decoder_start_token_id source_lang=source_lang, @@ -125,6 +142,7 @@ class MarianTokenizer(PreTrainedTokenizer): eos_token=eos_token, pad_token=pad_token, model_max_length=model_max_length, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) assert Path(source_spm).exists(), f"cannot find spm source {source_spm}" @@ -140,8 +158,8 @@ class MarianTokenizer(PreTrainedTokenizer): self.spm_files = [source_spm, target_spm] # load SentencePiece model for pre-processing - self.spm_source = load_spm(source_spm) - self.spm_target = load_spm(target_spm) + self.spm_source = load_spm(source_spm, self.sp_model_kwargs) + self.spm_target = load_spm(target_spm, self.sp_model_kwargs) self.current_spm = self.spm_source # Multilingual target side: default to using first supported language code. @@ -172,7 +190,7 @@ class MarianTokenizer(PreTrainedTokenizer): def _tokenize(self, text: str) -> List[str]: code, text = self.remove_language_code(text) - pieces = self.current_spm.EncodeAsPieces(text) + pieces = self.current_spm.encode(text, out_type=str) return code + pieces def _convert_id_to_token(self, index: int) -> str: @@ -283,7 +301,12 @@ class MarianTokenizer(PreTrainedTokenizer): def __setstate__(self, d: Dict) -> None: self.__dict__ = d - self.spm_source, self.spm_target = (load_spm(f) for f in self.spm_files) + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.spm_source, self.spm_target = (load_spm(f, self.sp_model_kwargs) for f in self.spm_files) self.current_spm = self.spm_source self._setup_normalizer() @@ -308,8 +331,8 @@ class MarianTokenizer(PreTrainedTokenizer): return self._special_token_mask(token_ids_0 + token_ids_1) + [1] -def load_spm(path: str) -> sentencepiece.SentencePieceProcessor: - spm = sentencepiece.SentencePieceProcessor() +def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor: + spm = 
sentencepiece.SentencePieceProcessor(**sp_model_kwargs) spm.Load(path) return spm diff --git a/src/transformers/models/mbart/tokenization_mbart50.py b/src/transformers/models/mbart/tokenization_mbart50.py index ef7ec88f24..6c11f2ab06 100644 --- a/src/transformers/models/mbart/tokenization_mbart50.py +++ b/src/transformers/models/mbart/tokenization_mbart50.py @@ -16,7 +16,7 @@ import os from contextlib import contextmanager from shutil import copyfile -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -76,6 +76,20 @@ class MBart50Tokenizer(PreTrainedTokenizer): mask_token (:obj:`str`, `optional`, defaults to :obj:`""`): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. Examples:: @@ -108,11 +122,14 @@ class MBart50Tokenizer(PreTrainedTokenizer): unk_token="", pad_token="", mask_token="", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( src_lang=src_lang, tgt_lang=tgt_lang, @@ -122,10 +139,11 @@ class MBart50Tokenizer(PreTrainedTokenizer): cls_token=cls_token, pad_token=pad_token, mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -177,7 +195,12 @@ class MBart50Tokenizer(PreTrainedTokenizer): def __setstate__(self, d: Dict) -> None: self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def get_vocab(self) -> Dict: @@ -186,7 +209,7 @@ class MBart50Tokenizer(PreTrainedTokenizer): return vocab def _tokenize(self, text: str) -> List[str]: - return self.sp_model.EncodeAsPieces(text) + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token: str) -> int: """Converts a token (str) in an id using the vocab.""" diff --git a/src/transformers/models/pegasus/tokenization_pegasus.py b/src/transformers/models/pegasus/tokenization_pegasus.py index 74671c98e3..15f6364923 100644 --- a/src/transformers/models/pegasus/tokenization_pegasus.py +++ b/src/transformers/models/pegasus/tokenization_pegasus.py @@ -14,7 +14,7 @@ # limitations under the License. 
import os from shutil import copyfile -from typing import Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -77,6 +77,20 @@ class PegasusTokenizer(PreTrainedTokenizer): tokenizer `__ that uses the tokens 2 - 104 only for pretraining + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
""" vocab_files_names = VOCAB_FILES_NAMES @@ -95,10 +109,10 @@ class PegasusTokenizer(PreTrainedTokenizer): mask_token_sent="", additional_special_tokens=None, offset=103, # entries 2 - 104 are only used for pretraining + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: self.offset = offset - if additional_special_tokens is not None: assert isinstance( additional_special_tokens, list @@ -123,6 +137,8 @@ class PegasusTokenizer(PreTrainedTokenizer): additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else [] additional_special_tokens += [f"" for i in range(2, self.offset)] + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( eos_token=eos_token, unk_token=unk_token, @@ -131,11 +147,12 @@ class PegasusTokenizer(PreTrainedTokenizer): mask_token_sent=mask_token_sent, offset=offset, additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.mask_token_sent = mask_token_sent self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) # add special tokens to encoder dict @@ -175,16 +192,17 @@ class PegasusTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - return pieces + return self.sp_model.encode(text, out_type=str) def 
_convert_token_to_id(self, token: str) -> int: """Converts a token (str) to an id using the vocab.""" diff --git a/src/transformers/models/reformer/tokenization_reformer.py b/src/transformers/models/reformer/tokenization_reformer.py index 535a93a31a..c816e73a7a 100644 --- a/src/transformers/models/reformer/tokenization_reformer.py +++ b/src/transformers/models/reformer/tokenization_reformer.py @@ -17,7 +17,7 @@ import os from shutil import copyfile -from typing import Dict, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -68,6 +68,20 @@ class ReformerTokenizer(PreTrainedTokenizer): The token used for padding, for example when batching sequences of different lengths. additional_special_tokens (:obj:`List[str]`, `optional`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
""" vocab_files_names = VOCAB_FILES_NAMES @@ -75,16 +89,27 @@ class ReformerTokenizer(PreTrainedTokenizer): max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES model_input_names = ["input_ids", "attention_mask"] - def __init__(self, vocab_file, eos_token="", unk_token="", additional_special_tokens=[], **kwargs): + def __init__( + self, + vocab_file, + eos_token="", + unk_token="", + additional_special_tokens=[], + sp_model_kwargs: Optional[Dict[str, Any]] = None, + **kwargs + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( eos_token=eos_token, unk_token=unk_token, additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @property @@ -103,16 +128,17 @@ class ReformerTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - return pieces + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" diff --git a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py index 502021d535..de7f05995c 100644 --- 
a/src/transformers/models/speech_to_text/tokenization_speech_to_text.py +++ b/src/transformers/models/speech_to_text/tokenization_speech_to_text.py @@ -17,7 +17,7 @@ import json from pathlib import Path from shutil import copyfile -from typing import Dict, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import sentencepiece @@ -79,6 +79,21 @@ class Speech2TextTokenizer(PreTrainedTokenizer): Whether or not to lowercase the input when tokenizing. tgt_lang (:obj:`str`, `optional`): A string representing the target language. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
+ **kwargs Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer` """ @@ -102,8 +117,11 @@ class Speech2TextTokenizer(PreTrainedTokenizer): do_lower_case=False, tgt_lang=None, lang_codes=None, + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs, - ): + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -113,6 +131,7 @@ class Speech2TextTokenizer(PreTrainedTokenizer): do_lower_case=do_lower_case, tgt_lang=tgt_lang, lang_codes=lang_codes, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.do_upper_case = do_upper_case @@ -121,7 +140,7 @@ class Speech2TextTokenizer(PreTrainedTokenizer): self.encoder = load_json(vocab_file) self.decoder = {v: k for k, v in self.encoder.items()} self.spm_file = spm_file - self.sp_model = load_spm(spm_file) + self.sp_model = load_spm(spm_file, self.sp_model_kwargs) if lang_codes is not None: self.lang_codes = lang_codes @@ -155,7 +174,7 @@ class Speech2TextTokenizer(PreTrainedTokenizer): self.prefix_tokens = [lang_code_id] def _tokenize(self, text: str) -> List[str]: - return self.sp_model.EncodeAsPieces(text) + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): return self.encoder.get(token, self.encoder[self.unk_token]) @@ -221,7 +240,12 @@ class Speech2TextTokenizer(PreTrainedTokenizer): def __setstate__(self, d: Dict) -> None: self.__dict__ = d - self.sp_model = load_spm(self.spm_file) + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs) def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: save_dir = Path(save_directory) @@ -241,8 +265,8 @@ class Speech2TextTokenizer(PreTrainedTokenizer): return (str(vocab_save_path), str(spm_save_path)) -def load_spm(path: str) -> 
sentencepiece.SentencePieceProcessor: - spm = sentencepiece.SentencePieceProcessor() +def load_spm(path: str, sp_model_kwargs: Dict[str, Any]) -> sentencepiece.SentencePieceProcessor: + spm = sentencepiece.SentencePieceProcessor(**sp_model_kwargs) spm.Load(str(path)) return spm diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 949aba04eb..6daf19d4c8 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -19,7 +19,7 @@ import os import re import warnings from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -81,6 +81,20 @@ class T5Tokenizer(PreTrainedTokenizer): `__). additional_special_tokens (:obj:`List[str]`, `optional`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -100,8 +114,9 @@ class T5Tokenizer(PreTrainedTokenizer): pad_token="", extra_ids=100, additional_special_tokens=None, + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Add extra_ids to the special token list if extra_ids > 0 and additional_special_tokens is None: additional_special_tokens = [f"" for i in range(extra_ids)] @@ -114,19 +129,22 @@ class T5Tokenizer(PreTrainedTokenizer): "In this case the additional_special_tokens must include the extra_ids tokens" ) + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( eos_token=eos_token, unk_token=unk_token, pad_token=pad_token, extra_ids=extra_ids, additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) self.vocab_file = vocab_file self._extra_ids = extra_ids - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @property @@ -231,16 +249,17 @@ class T5Tokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Take as input a string and return a list of strings (tokens) for words/sub-words""" - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) - return pieces + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py 
b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py index b2707f8dcb..c0c8e90c5e 100644 --- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py @@ -16,7 +16,7 @@ import collections import os from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple from ...tokenization_utils import PreTrainedTokenizer from ...utils import logging @@ -96,6 +96,20 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. 
Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -117,8 +131,11 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): pad_token="[PAD]", cls_token="[CLS]", mask_token="[MASK]", + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( bos_token=bos_token, eos_token=eos_token, @@ -127,6 +144,7 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): pad_token=pad_token, cls_token=cls_token, mask_token=mask_token, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) @@ -139,7 +157,7 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): ) raise - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(str(vocab_file)) self.vocab_file = vocab_file @@ -177,7 +195,12 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): "pip install sentencepiece" ) raise - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def get_special_tokens_mask( @@ -241,8 +264,8 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): vocab.update(self.added_tokens_encoder) return vocab - def _tokenize(self, text): - return self.sp_model.EncodeAsPieces(text) + def _tokenize(self, text: str) -> str: + return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): """Converts a token (str) in an id using the vocab.""" diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py index 9241c4f470..564f6e50a6 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py @@ -17,7 +17,7 @@ import os from shutil import 
copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -94,7 +94,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. - sp_model_kwargs (:obj:`dict`, `optional`, defaults to :obj:`None`): + sp_model_kwargs (:obj:`dict`, `optional`): Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece `__ can be used, among other things, to set: @@ -129,9 +129,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): unk_token="", pad_token="", mask_token="", - sp_model_kwargs=None, + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Mask token behave like a normal word, i.e. include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token @@ -271,7 +271,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): vocab.update(self.added_tokens_encoder) return vocab - def _tokenize(self, text): + def _tokenize(self, text: str) -> List[str]: return self.sp_model.encode(text, out_type=str) def _convert_token_to_id(self, token): diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py index 5137bcfee3..afd87e309c 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet.py +++ b/src/transformers/models/xlnet/tokenization_xlnet.py @@ -18,7 +18,7 @@ import os import unicodedata from shutil import copyfile -from typing import List, Optional, Tuple +from typing import Any, Dict, List, Optional, Tuple import sentencepiece as spm @@ -99,6 +99,20 @@ class XLNetTokenizer(PreTrainedTokenizer): modeling. This is the token which the model will try to predict. 
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["", ""]`): Additional special tokens used by the tokenizer. + sp_model_kwargs (:obj:`dict`, `optional`): + Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece + `__ can be used, among other things, to set: + + - ``enable_sampling``: Enable subword regularization. + - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout. + + - ``nbest_size = {0,1}``: No sampling is performed. + - ``nbest_size > 1``: samples from the nbest_size results. + - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice) + using forward-filtering-and-backward-sampling algorithm. + + - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for + BPE-dropout. Attributes: sp_model (:obj:`SentencePieceProcessor`): @@ -124,11 +138,14 @@ class XLNetTokenizer(PreTrainedTokenizer): cls_token="", mask_token="", additional_special_tokens=["", ""], + sp_model_kwargs: Optional[Dict[str, Any]] = None, **kwargs - ): + ) -> None: # Mask token behave like a normal word, i.e. 
include the space before it mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + super().__init__( do_lower_case=do_lower_case, remove_space=remove_space, @@ -141,6 +158,7 @@ class XLNetTokenizer(PreTrainedTokenizer): cls_token=cls_token, mask_token=mask_token, additional_special_tokens=additional_special_tokens, + sp_model_kwargs=self.sp_model_kwargs, **kwargs, ) @@ -151,7 +169,7 @@ class XLNetTokenizer(PreTrainedTokenizer): self.keep_accents = keep_accents self.vocab_file = vocab_file - self.sp_model = spm.SentencePieceProcessor() + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(vocab_file) @property @@ -170,7 +188,12 @@ class XLNetTokenizer(PreTrainedTokenizer): def __setstate__(self, d): self.__dict__ = d - self.sp_model = spm.SentencePieceProcessor() + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) self.sp_model.Load(self.vocab_file) def preprocess_text(self, inputs): @@ -188,14 +211,10 @@ class XLNetTokenizer(PreTrainedTokenizer): return outputs - def _tokenize(self, text, sample=False): + def _tokenize(self, text: str) -> List[str]: """Tokenize a string.""" text = self.preprocess_text(text) - - if not sample: - pieces = self.sp_model.EncodeAsPieces(text) - else: - pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1) + pieces = self.sp_model.encode(text, out_type=str) new_pieces = [] for piece in pieces: if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): diff --git a/tests/test_tokenization_albert.py b/tests/test_tokenization_albert.py index 16596524b0..465fa71d76 100644 --- a/tests/test_tokenization_albert.py +++ b/tests/test_tokenization_albert.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # 
limitations under the License. - import os import unittest @@ -33,6 +32,8 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = AlbertTokenizer rust_tokenizer_class = AlbertTokenizerFast test_rust_tokenizer = True + test_sentencepiece = True + test_sentencepiece_ignore_case = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_barthez.py b/tests/test_tokenization_barthez.py index 1c3a3d18ef..e3ba4df9b1 100644 --- a/tests/test_tokenization_barthez.py +++ b/tests/test_tokenization_barthez.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import unittest from transformers import BarthezTokenizer, BarthezTokenizerFast, BatchEncoding @@ -24,12 +23,13 @@ from .test_tokenization_common import TokenizerTesterMixin @require_tokenizers @require_sentencepiece -@slow +@slow # see https://github.com/huggingface/transformers/issues/11457 class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BarthezTokenizer rust_tokenizer_class = BarthezTokenizerFast test_rust_tokenizer = True + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_bert_generation.py b/tests/test_tokenization_bert_generation.py index d1aa93715a..e540b98647 100644 --- a/tests/test_tokenization_bert_generation.py +++ b/tests/test_tokenization_bert_generation.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import os import unittest @@ -33,6 +32,7 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BertGenerationTokenizer + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_big_bird.py b/tests/test_tokenization_big_bird.py index 5645eb401f..c4d700cad6 100644 --- a/tests/test_tokenization_big_bird.py +++ b/tests/test_tokenization_big_bird.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import os import unittest @@ -36,11 +35,12 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = BigBirdTokenizer rust_tokenizer_class = BigBirdTokenizerFast test_rust_tokenizer = True + test_sentencepiece = True def setUp(self): super().setUp() - tokenizer = BigBirdTokenizer(SAMPLE_VOCAB, keep_accents=True) + tokenizer = self.tokenizer_class(SAMPLE_VOCAB, keep_accents=True) tokenizer.save_pretrained(self.tmpdirname) def test_rust_and_python_full_tokenizers(self): diff --git a/tests/test_tokenization_camembert.py b/tests/test_tokenization_camembert.py index 4dc1c88de1..29faec4925 100644 --- a/tests/test_tokenization_camembert.py +++ b/tests/test_tokenization_camembert.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import os import unittest @@ -37,6 +36,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = CamembertTokenizer rust_tokenizer_class = CamembertTokenizerFast test_rust_tokenizer = True + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 25213e447c..c8b4bbc21e 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -15,6 +15,7 @@ import inspect +import itertools import os import pickle import re @@ -100,6 +101,13 @@ class TokenizerTesterMixin: from_pretrained_vocab_key = "vocab_file" test_seq2seq = True + # set to True to test a sentencepiece tokenizer + test_sentencepiece = False + + # set to True to ignore casing when testing a sentencepiece tokenizer + # test_sentencepiece must also be set to True + test_sentencepiece_ignore_case = False + def setUp(self) -> None: # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # information available in Tokenizer (name, rust class, python class, vocab key name) @@ -216,6 +224,38 @@ class TokenizerTesterMixin: for i in range(len(batch_encode_plus_sequences["input_ids"])) ] + def test_subword_regularization_tokenizer(self) -> None: + if not self.test_sentencepiece: + return + + # Subword regularization is only available for the slow tokenizer. 
+ sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1} + tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs) + + self.assertTrue(hasattr(tokenizer, "sp_model_kwargs")) + self.assertIsNotNone(tokenizer.sp_model_kwargs) + self.assertTrue(isinstance(tokenizer.sp_model_kwargs, dict)) + self.assertEqual(tokenizer.sp_model_kwargs, sp_model_kwargs) + self.check_subword_sampling(tokenizer) + + def test_pickle_subword_regularization_tokenizer(self) -> None: + if not self.test_sentencepiece: + return + + """Google pickle __getstate__ __setstate__ if you are struggling with this.""" + # Subword regularization is only available for the slow tokenizer. + sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1} + tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs) + tokenizer_bin = pickle.dumps(tokenizer) + del tokenizer + tokenizer_new = pickle.loads(tokenizer_bin) + + self.assertTrue(hasattr(tokenizer_new, "sp_model_kwargs")) + self.assertIsNotNone(tokenizer_new.sp_model_kwargs) + self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict)) + self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs) + self.check_subword_sampling(tokenizer_new) + def test_model_input_names_signature(self): accepted_model_main_input_names = [ "input_ids", # nlp models @@ -1727,6 +1767,46 @@ class TokenizerTesterMixin: # add pad_token_id to pass subsequent tests tokenizer.add_special_tokens({"pad_token": ""}) + def check_subword_sampling( + self, + tokenizer: PreTrainedTokenizer, + text: str = None, + ) -> None: + """ + Check if the tokenizer generates different results when subword regularization is enabled. + + Subword regularization augments training data with subword sampling. + This has a random component. + + Args: + tokenizer: The tokenizer to check. + text: The text to use for the checks. + """ + text = "This is a test for subword regularization." 
if text is None else text + if self.test_sentencepiece_ignore_case: + text = text.lower() + + tokens_list = [] + for _ in range(5): + tokens_list.append(tokenizer.tokenize(text)) + + # the list of different pairs of tokens_list + combinations = itertools.combinations(tokens_list, 2) + + # check of sampling is done + subword_sampling_found = False + for combination in combinations: + if combination[0] != combination[1]: + subword_sampling_found = True + self.assertTrue(subword_sampling_found) + + # check if converting back to original text works + for tokens in tokens_list: + if self.test_sentencepiece_ignore_case: + self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens).lower()) + else: + self.assertEqual(text, tokenizer.convert_tokens_to_string(tokens)) + @require_torch @slow def test_torch_encode_plus_sent_to_model(self): diff --git a/tests/test_tokenization_deberta_v2.py b/tests/test_tokenization_deberta_v2.py index 2fdf74d003..fbc1c2d10d 100644 --- a/tests/test_tokenization_deberta_v2.py +++ b/tests/test_tokenization_deberta_v2.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import os import unittest @@ -33,6 +32,8 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = DebertaV2Tokenizer rust_tokenizer_class = None test_rust_tokenizer = False + test_sentencepiece = True + test_sentencepiece_ignore_case = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_m2m_100.py b/tests/test_tokenization_m2m_100.py index 4f7cf6ffae..b151625eeb 100644 --- a/tests/test_tokenization_m2m_100.py +++ b/tests/test_tokenization_m2m_100.py @@ -45,6 +45,7 @@ class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = M2M100Tokenizer test_rust_tokenizer = False test_seq2seq = False + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_marian.py b/tests/test_tokenization_marian.py index 3d9146b11f..f3986d9c72 100644 --- a/tests/test_tokenization_marian.py +++ b/tests/test_tokenization_marian.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import os import tempfile import unittest @@ -50,6 +49,7 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = MarianTokenizer test_rust_tokenizer = False + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_mbart50.py b/tests/test_tokenization_mbart50.py index 49dfc0b66f..5d0c4362d3 100644 --- a/tests/test_tokenization_mbart50.py +++ b/tests/test_tokenization_mbart50.py @@ -38,6 +38,7 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = MBart50Tokenizer rust_tokenizer_class = MBart50TokenizerFast test_rust_tokenizer = True + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_pegasus.py b/tests/test_tokenization_pegasus.py index 0db2d34cd7..8b15b339c4 100644 --- a/tests/test_tokenization_pegasus.py +++ b/tests/test_tokenization_pegasus.py @@ -31,6 +31,7 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = PegasusTokenizer rust_tokenizer_class = PegasusTokenizerFast test_rust_tokenizer = True + test_sentencepiece = True def setUp(self): super().setUp() @@ -104,6 +105,7 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = PegasusTokenizer rust_tokenizer_class = PegasusTokenizerFast test_rust_tokenizer = True + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_reformer.py b/tests/test_tokenization_reformer.py index 179cf9bcd1..1729ba8d9d 100644 --- a/tests/test_tokenization_reformer.py +++ b/tests/test_tokenization_reformer.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import os import unittest @@ -34,6 +33,7 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase): rust_tokenizer_class = ReformerTokenizerFast test_rust_tokenizer = True test_seq2seq = False + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_speech_to_text.py b/tests/test_tokenization_speech_to_text.py index 2a42b04a50..08a7150388 100644 --- a/tests/test_tokenization_speech_to_text.py +++ b/tests/test_tokenization_speech_to_text.py @@ -40,6 +40,7 @@ ES_CODE = 10 class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = Speech2TextTokenizer test_rust_tokenizer = False + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_t5.py b/tests/test_tokenization_t5.py index 26d8317b5a..be64acf083 100644 --- a/tests/test_tokenization_t5.py +++ b/tests/test_tokenization_t5.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import unittest from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast @@ -40,6 +39,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = T5Tokenizer rust_tokenizer_class = T5TokenizerFast test_rust_tokenizer = True + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_xlm_prophetnet.py b/tests/test_tokenization_xlm_prophetnet.py index dd426547ac..771bb8c6d3 100644 --- a/tests/test_tokenization_xlm_prophetnet.py +++ b/tests/test_tokenization_xlm_prophetnet.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
- import os import unittest @@ -32,6 +31,7 @@ class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = XLMProphetNetTokenizer test_rust_tokenizer = False + test_sentencepiece = True def setUp(self): super().setUp() diff --git a/tests/test_tokenization_xlm_roberta.py b/tests/test_tokenization_xlm_roberta.py index b9fe4dde62..816ad17925 100644 --- a/tests/test_tokenization_xlm_roberta.py +++ b/tests/test_tokenization_xlm_roberta.py @@ -13,10 +13,7 @@ # See the License for the specific language governing permissions and # limitations under the License. - -import itertools import os -import pickle import unittest from transformers import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast @@ -36,6 +33,7 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = XLMRobertaTokenizer rust_tokenizer_class = XLMRobertaTokenizerFast test_rust_tokenizer = True + test_sentencepiece = True def setUp(self): super().setUp() @@ -120,41 +118,6 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): ], ) - def test_subword_regularization_tokenizer(self): - # Subword regularization is only available for the slow tokenizer. - tokenizer = XLMRobertaTokenizer( - SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1} - ) - - # Subword regularization augments training data with subword sampling. - # This has a random component. We test if the tokenizer generates different - # results when subword regularization is enabled. 
- tokens_list = [] - for _ in range(5): - tokens_list.append(tokenizer.tokenize("This is a test for subword regularization.")) - - # the list of different pairs of tokens_list - combinations = itertools.combinations(tokens_list, 2) - - all_equal = True - for combination in combinations: - if combination[0] != combination[1]: - all_equal = False - - self.assertFalse(all_equal) - - def test_pickle_subword_regularization_tokenizer(self): - """Google pickle __getstate__ __setstate__ if you are struggling with this.""" - # Subword regularization is only available for the slow tokenizer. - sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1} - tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs=sp_model_kwargs) - tokenizer_bin = pickle.dumps(tokenizer) - tokenizer_new = pickle.loads(tokenizer_bin) - - self.assertIsNotNone(tokenizer_new.sp_model_kwargs) - self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict)) - self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs) - @cached_property def big_tokenizer(self): return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base") diff --git a/tests/test_tokenization_xlnet.py b/tests/test_tokenization_xlnet.py index fb018ec5c2..c7168b38c5 100644 --- a/tests/test_tokenization_xlnet.py +++ b/tests/test_tokenization_xlnet.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import os import unittest @@ -33,6 +32,7 @@ class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase): tokenizer_class = XLNetTokenizer rust_tokenizer_class = XLNetTokenizerFast test_rust_tokenizer = True + test_sentencepiece = True def setUp(self): super().setUp()