Enable option for subword regularization in more tokenizers. (#11417)

* improve slow class tok usage at xlm rob

* add subword regularization for barthez

* improve barthez tok. test

* fix tokenizer tests

* add subword regularization for camembert

* add subword regularization for deberta v2 tokenizer

* add more doc to deberta v2 tokenizer

* add subword regularization for speech to text tok.

* fix sp_model_kwargs type in speech 2 text tok.

* add subword regularization for M2M100 tok.

* add more concrete type hints

* fix tests for m2m100 and s2t tok.

* add missing Any import

* fix syntax error in m2m100 tok.

* fix unpickle of m2m100 and s2t tok.

* fix test of m2m100 and s2t tok.

* improve unpickle of deberta v2 tok.

* add test for pickle of barthez & camembert

* fix pickle of barthez & camembert

* add test for deberta v2 tok. pickle

* fix m2m100 tok. pickle

* fix s2t tok. pickle

* add subword regularization to albert tok.

* refactor subword reg. test into TokenizerTesterMixin

improve albert tok. test

remove sample argument from albert tok.

check subword reg. using TokenizerTesterMixin

improve tok. tests

improve xlm roberta tok. tests

improve xlm roberta tok. tests

* add subword regularization for big bird t.

* improve xlm roberta tok. test

* add subword regularization for mbart50 tok.

* add subword regularization for pegasus tok.

* add subword regularization for reformer tok.

* add subword regularization for T5 tok.

* fix t5 tok. test formatting

* add subword regularization for xlm_proph. tok.

* add subword regularization for xlnet tok.

* add subword regularization for bert_gen tok.

* add typing to tokenizers

* add typing to xlm rob. tok

* add subword regularization for marian tok.

* add reverse tok. test

* fix marian tok test

* fix marian tok test

* fix casing in tok. tests

* fix style of tok. common test

* fix deberta v2 tok test

* add type annotations to tok. tests

* add type annotations to tok. __init__

* add typing to tokenizer

* add type annotations to tok. __init__

* don't specify the default when it's None

* fix barthez tok. doc

* move sentencepiece tok. tests to TokenizerTesterMixin

* fix unused imports

* fix albert tok. test

* add comment to sentencepiece test options

* fix Any import at big bird tok.

* fix Any import at xlm prophetnet tok.

* empty commit to trigger CI
This commit is contained in:
Philip May
2021-05-13 08:44:55 +02:00
committed by GitHub
parent fa84540e98
commit 37ed3ab719
33 changed files with 578 additions and 181 deletions

View File

@@ -18,7 +18,7 @@
import os import os
import unicodedata import unicodedata
from shutil import copyfile from shutil import copyfile
from typing import List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm import sentencepiece as spm
@@ -102,6 +102,20 @@ class AlbertTokenizer(PreTrainedTokenizer):
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes: Attributes:
sp_model (:obj:`SentencePieceProcessor`): sp_model (:obj:`SentencePieceProcessor`):
@@ -125,11 +139,14 @@ class AlbertTokenizer(PreTrainedTokenizer):
pad_token="<pad>", pad_token="<pad>",
cls_token="[CLS]", cls_token="[CLS]",
mask_token="[MASK]", mask_token="[MASK]",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
# Mask token behave like a normal word, i.e. include the space before it # Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
do_lower_case=do_lower_case, do_lower_case=do_lower_case,
remove_space=remove_space, remove_space=remove_space,
@@ -141,6 +158,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
pad_token=pad_token, pad_token=pad_token,
cls_token=cls_token, cls_token=cls_token,
mask_token=mask_token, mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
@@ -149,7 +167,7 @@ class AlbertTokenizer(PreTrainedTokenizer):
self.keep_accents = keep_accents self.keep_accents = keep_accents
self.vocab_file = vocab_file self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file) self.sp_model.Load(vocab_file)
@property @property
@@ -168,7 +186,12 @@ class AlbertTokenizer(PreTrainedTokenizer):
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__ = d self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor()
# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file) self.sp_model.Load(self.vocab_file)
def preprocess_text(self, inputs): def preprocess_text(self, inputs):
@@ -186,14 +209,10 @@ class AlbertTokenizer(PreTrainedTokenizer):
return outputs return outputs
def _tokenize(self, text, sample=False): def _tokenize(self, text: str) -> List[str]:
"""Tokenize a string.""" """Tokenize a string."""
text = self.preprocess_text(text) text = self.preprocess_text(text)
pieces = self.sp_model.encode(text, out_type=str)
if not sample:
pieces = self.sp_model.EncodeAsPieces(text)
else:
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
new_pieces = [] new_pieces = []
for piece in pieces: for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():

View File

@@ -17,7 +17,7 @@
import os import os
from shutil import copyfile from shutil import copyfile
from typing import List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm import sentencepiece as spm
@@ -89,6 +89,20 @@ class BarthezTokenizer(PreTrainedTokenizer):
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes: Attributes:
sp_model (:obj:`SentencePieceProcessor`): sp_model (:obj:`SentencePieceProcessor`):
@@ -110,11 +124,14 @@ class BarthezTokenizer(PreTrainedTokenizer):
unk_token="<unk>", unk_token="<unk>",
pad_token="<pad>", pad_token="<pad>",
mask_token="<mask>", mask_token="<mask>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
# Mask token behave like a normal word, i.e. include the space before it # Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
bos_token=bos_token, bos_token=bos_token,
eos_token=eos_token, eos_token=eos_token,
@@ -123,11 +140,12 @@ class BarthezTokenizer(PreTrainedTokenizer):
cls_token=cls_token, cls_token=cls_token,
pad_token=pad_token, pad_token=pad_token,
mask_token=mask_token, mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
self.vocab_file = vocab_file self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file)) self.sp_model.Load(str(vocab_file))
self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3} self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
@@ -219,8 +237,8 @@ class BarthezTokenizer(PreTrainedTokenizer):
vocab.update(self.added_tokens_encoder) vocab.update(self.added_tokens_encoder)
return vocab return vocab
def _tokenize(self, text): def _tokenize(self, text: str) -> List[str]:
return self.sp_model.EncodeAsPieces(text) return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab.""" """Converts a token (str) in an id using the vocab."""
@@ -243,7 +261,12 @@ class BarthezTokenizer(PreTrainedTokenizer):
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__ = d self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor()
# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file) self.sp_model.Load(self.vocab_file)
def convert_tokens_to_string(self, tokens): def convert_tokens_to_string(self, tokens):

View File

@@ -17,7 +17,7 @@
import os import os
from shutil import copyfile from shutil import copyfile
from typing import List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm import sentencepiece as spm
@@ -58,6 +58,20 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
token instead. token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`): pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -74,8 +88,11 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
unk_token="<unk>", unk_token="<unk>",
pad_token="<pad>", pad_token="<pad>",
sep_token="<::::>", sep_token="<::::>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
# Add extra_ids to the special token list # Add extra_ids to the special token list
super().__init__( super().__init__(
bos_token=bos_token, bos_token=bos_token,
@@ -83,12 +100,13 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
unk_token=unk_token, unk_token=unk_token,
pad_token=pad_token, pad_token=pad_token,
sep_token=sep_token, sep_token=sep_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
self.vocab_file = vocab_file self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file) self.sp_model.Load(vocab_file)
@property @property
@@ -107,16 +125,17 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__ = d self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor()
# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file) self.sp_model.Load(self.vocab_file)
def _tokenize(self, text, sample=False): def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words""" """Take as input a string and return a list of strings (tokens) for words/sub-words"""
if not sample: return self.sp_model.encode(text, out_type=str)
pieces = self.sp_model.EncodeAsPieces(text)
else:
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
return pieces
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab.""" """Converts a token (str) in an id using the vocab."""

View File

@@ -17,7 +17,7 @@
import os import os
from shutil import copyfile from shutil import copyfile
from typing import List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm import sentencepiece as spm
@@ -74,7 +74,20 @@ class BigBirdTokenizer(PreTrainedTokenizer):
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -93,8 +106,9 @@ class BigBirdTokenizer(PreTrainedTokenizer):
sep_token="[SEP]", sep_token="[SEP]",
mask_token="[MASK]", mask_token="[MASK]",
cls_token="[CLS]", cls_token="[CLS]",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
@@ -105,6 +119,8 @@ class BigBirdTokenizer(PreTrainedTokenizer):
# Mask token behave like a normal word, i.e. include the space before it # Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
bos_token=bos_token, bos_token=bos_token,
eos_token=eos_token, eos_token=eos_token,
@@ -113,12 +129,13 @@ class BigBirdTokenizer(PreTrainedTokenizer):
sep_token=sep_token, sep_token=sep_token,
mask_token=mask_token, mask_token=mask_token,
cls_token=cls_token, cls_token=cls_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
self.vocab_file = vocab_file self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file) self.sp_model.Load(vocab_file)
@property @property
@@ -137,16 +154,17 @@ class BigBirdTokenizer(PreTrainedTokenizer):
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__ = d self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor()
# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file) self.sp_model.Load(self.vocab_file)
def _tokenize(self, text, sample=False): def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words""" """Take as input a string and return a list of strings (tokens) for words/sub-words"""
if not sample: return self.sp_model.encode(text, out_type=str)
pieces = self.sp_model.EncodeAsPieces(text)
else:
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
return pieces
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab.""" """Converts a token (str) in an id using the vocab."""

View File

@@ -17,7 +17,7 @@
import os import os
from shutil import copyfile from shutil import copyfile
from typing import List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm import sentencepiece as spm
@@ -85,6 +85,20 @@ class CamembertTokenizer(PreTrainedTokenizer):
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes: Attributes:
sp_model (:obj:`SentencePieceProcessor`): sp_model (:obj:`SentencePieceProcessor`):
@@ -107,11 +121,14 @@ class CamembertTokenizer(PreTrainedTokenizer):
pad_token="<pad>", pad_token="<pad>",
mask_token="<mask>", mask_token="<mask>",
additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"], additional_special_tokens=["<s>NOTUSED", "</s>NOTUSED"],
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
# Mask token behave like a normal word, i.e. include the space before it # Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
bos_token=bos_token, bos_token=bos_token,
eos_token=eos_token, eos_token=eos_token,
@@ -121,9 +138,10 @@ class CamembertTokenizer(PreTrainedTokenizer):
pad_token=pad_token, pad_token=pad_token,
mask_token=mask_token, mask_token=mask_token,
additional_special_tokens=additional_special_tokens, additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file)) self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file self.vocab_file = vocab_file
# HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual # HACK: These tokens were added by fairseq but don't seem to be actually used when duplicated in the actual
@@ -218,8 +236,8 @@ class CamembertTokenizer(PreTrainedTokenizer):
vocab.update(self.added_tokens_encoder) vocab.update(self.added_tokens_encoder)
return vocab return vocab
def _tokenize(self, text): def _tokenize(self, text: str) -> List[str]:
return self.sp_model.EncodeAsPieces(text) return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab.""" """Converts a token (str) in an id using the vocab."""
@@ -243,7 +261,12 @@ class CamembertTokenizer(PreTrainedTokenizer):
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__ = d self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor()
# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file) self.sp_model.Load(self.vocab_file)
def convert_tokens_to_string(self, tokens): def convert_tokens_to_string(self, tokens):

View File

@@ -16,7 +16,7 @@
import os import os
import unicodedata import unicodedata
from typing import Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as sp import sentencepiece as sp
import six import six
@@ -75,6 +75,20 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -92,8 +106,11 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
pad_token="[PAD]", pad_token="[PAD]",
cls_token="[CLS]", cls_token="[CLS]",
mask_token="[MASK]", mask_token="[MASK]",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
do_lower_case=do_lower_case, do_lower_case=do_lower_case,
unk_token=unk_token, unk_token=unk_token,
@@ -102,6 +119,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
cls_token=cls_token, cls_token=cls_token,
mask_token=mask_token, mask_token=mask_token,
split_by_punct=split_by_punct, split_by_punct=split_by_punct,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
@@ -112,7 +130,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
) )
self.do_lower_case = do_lower_case self.do_lower_case = do_lower_case
self.split_by_punct = split_by_punct self.split_by_punct = split_by_punct
self._tokenizer = SPMTokenizer(vocab_file, split_by_punct=split_by_punct) self._tokenizer = SPMTokenizer(vocab_file, split_by_punct=split_by_punct, sp_model_kwargs=self.sp_model_kwargs)
@property @property
def vocab_size(self): def vocab_size(self):
@@ -127,7 +145,7 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
vocab.update(self.get_added_vocab()) vocab.update(self.get_added_vocab())
return vocab return vocab
def _tokenize(self, text): def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words""" """Take as input a string and return a list of strings (tokens) for words/sub-words"""
if self.do_lower_case: if self.do_lower_case:
text = text.lower() text = text.lower()
@@ -234,10 +252,34 @@ class DebertaV2Tokenizer(PreTrainedTokenizer):
class SPMTokenizer: class SPMTokenizer:
def __init__(self, vocab_file, split_by_punct=False): r"""
Constructs a tokenizer based on `SentencePiece <https://github.com/google/sentencepiece>`__.
Args:
vocab_file (:obj:`str`):
`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
contains the vocabulary necessary to instantiate a tokenizer.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
"""
def __init__(self, vocab_file, split_by_punct=False, sp_model_kwargs: Optional[Dict[str, Any]] = None):
self.split_by_punct = split_by_punct self.split_by_punct = split_by_punct
self.vocab_file = vocab_file self.vocab_file = vocab_file
spm = sp.SentencePieceProcessor() self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
assert os.path.exists(vocab_file) assert os.path.exists(vocab_file)
spm.load(vocab_file) spm.load(vocab_file)
bpe_vocab_size = spm.GetPieceSize() bpe_vocab_size = spm.GetPieceSize()
@@ -261,7 +303,12 @@ class SPMTokenizer:
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__ = d self.__dict__ = d
self.spm = sp.SentencePieceProcessor()
# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.spm = sp.SentencePieceProcessor(**self.sp_model_kwargs)
self.spm.Load(self.vocab_file) self.spm.Load(self.vocab_file)
def tokenize(self, text): def tokenize(self, text):
@@ -344,10 +391,10 @@ class SPMTokenizer:
text = convert_to_unicode(text) text = convert_to_unicode(text)
if self.split_by_punct: if self.split_by_punct:
words = self._run_split_on_punc(text) words = self._run_split_on_punc(text)
pieces = [self.spm.encode_as_pieces(w) for w in words] pieces = [self.spm.encode(w, out_type=str) for w in words]
return [p for w in pieces for p in w] return [p for w in pieces for p in w]
else: else:
return self.spm.encode_as_pieces(text) return self.spm.encode(text, out_type=str)
def split_to_words(self, text): def split_to_words(self, text):
pieces = self._encode_as_pieces(text) pieces = self._encode_as_pieces(text)

View File

@@ -16,7 +16,7 @@ import json
from contextlib import contextmanager from contextlib import contextmanager
from pathlib import Path from pathlib import Path
from shutil import copyfile from shutil import copyfile
from typing import Dict, List, Optional, Tuple, Union from typing import Any, Dict, List, Optional, Tuple, Union
import sentencepiece import sentencepiece
@@ -86,6 +86,20 @@ class M2M100Tokenizer(PreTrainedTokenizer):
token instead. token instead.
pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`): pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
using forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Examples:: Examples::
@@ -118,8 +132,11 @@ class M2M100Tokenizer(PreTrainedTokenizer):
sep_token="</s>", sep_token="</s>",
pad_token="<pad>", pad_token="<pad>",
unk_token="<unk>", unk_token="<unk>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs, **kwargs,
): ) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
src_lang=src_lang, src_lang=src_lang,
tgt_lang=tgt_lang, tgt_lang=tgt_lang,
@@ -128,6 +145,7 @@ class M2M100Tokenizer(PreTrainedTokenizer):
sep_token=sep_token, sep_token=sep_token,
unk_token=unk_token, unk_token=unk_token,
pad_token=pad_token, pad_token=pad_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
@@ -135,7 +153,7 @@ class M2M100Tokenizer(PreTrainedTokenizer):
self.encoder = load_json(vocab_file) self.encoder = load_json(vocab_file)
self.decoder = {v: k for k, v in self.encoder.items()} self.decoder = {v: k for k, v in self.encoder.items()}
self.spm_file = spm_file self.spm_file = spm_file
self.sp_model = load_spm(spm_file) self.sp_model = load_spm(spm_file, self.sp_model_kwargs)
self.encoder_size = len(self.encoder) self.encoder_size = len(self.encoder)
@@ -169,7 +187,7 @@ class M2M100Tokenizer(PreTrainedTokenizer):
self.set_src_lang_special_tokens(self._src_lang) self.set_src_lang_special_tokens(self._src_lang)
def _tokenize(self, text: str) -> List[str]: def _tokenize(self, text: str) -> List[str]:
return self.sp_model.EncodeAsPieces(text) return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):
if token in self.lang_token_to_id: if token in self.lang_token_to_id:
@@ -256,7 +274,12 @@ class M2M100Tokenizer(PreTrainedTokenizer):
def __setstate__(self, d: Dict) -> None:
    """Restore the instance from state ``d`` and reload the SentencePiece model.

    Args:
        d: Mapping that becomes the instance ``__dict__``.
    """
    self.__dict__ = d

    # for backward compatibility: state that predates ``sp_model_kwargs``
    # lacks the attribute, so fall back to "no extra options".
    if not hasattr(self, "sp_model_kwargs"):
        self.sp_model_kwargs = {}

    self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
save_dir = Path(save_directory) save_dir = Path(save_directory)
@@ -330,8 +353,8 @@ class M2M100Tokenizer(PreTrainedTokenizer):
return self.lang_token_to_id[lang_token] return self.lang_token_to_id[lang_token]
def load_spm(
    path: str, sp_model_kwargs: Optional[Dict[str, Any]] = None
) -> sentencepiece.SentencePieceProcessor:
    """Build a ``SentencePieceProcessor`` and load the model stored at ``path``.

    Args:
        path: Location of the SentencePiece model file; converted with ``str``
            before loading.
        sp_model_kwargs: Keyword arguments forwarded to the
            ``SentencePieceProcessor`` constructor. ``None`` (the default)
            means no extra arguments, which keeps the previous one-argument
            call ``load_spm(path)`` working.

    Returns:
        The loaded processor.
    """
    processor = sentencepiece.SentencePieceProcessor(**(sp_model_kwargs or {}))
    processor.Load(str(path))
    return processor

View File

@@ -18,7 +18,7 @@ import warnings
from contextlib import contextmanager from contextlib import contextmanager
from pathlib import Path from pathlib import Path
from shutil import copyfile from shutil import copyfile
from typing import Dict, List, Optional, Tuple, Union from typing import Any, Dict, List, Optional, Tuple, Union
import sentencepiece import sentencepiece
@@ -82,6 +82,20 @@ class MarianTokenizer(PreTrainedTokenizer):
The maximum sentence length the model accepts. The maximum sentence length the model accepts.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
using the forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Examples:: Examples::
@@ -115,8 +129,11 @@ class MarianTokenizer(PreTrainedTokenizer):
eos_token="</s>", eos_token="</s>",
pad_token="<pad>", pad_token="<pad>",
model_max_length=512, model_max_length=512,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
# bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id # bos_token=bos_token, unused. Start decoding with config.decoder_start_token_id
source_lang=source_lang, source_lang=source_lang,
@@ -125,6 +142,7 @@ class MarianTokenizer(PreTrainedTokenizer):
eos_token=eos_token, eos_token=eos_token,
pad_token=pad_token, pad_token=pad_token,
model_max_length=model_max_length, model_max_length=model_max_length,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
assert Path(source_spm).exists(), f"cannot find spm source {source_spm}" assert Path(source_spm).exists(), f"cannot find spm source {source_spm}"
@@ -140,8 +158,8 @@ class MarianTokenizer(PreTrainedTokenizer):
self.spm_files = [source_spm, target_spm] self.spm_files = [source_spm, target_spm]
# load SentencePiece model for pre-processing # load SentencePiece model for pre-processing
self.spm_source = load_spm(source_spm) self.spm_source = load_spm(source_spm, self.sp_model_kwargs)
self.spm_target = load_spm(target_spm) self.spm_target = load_spm(target_spm, self.sp_model_kwargs)
self.current_spm = self.spm_source self.current_spm = self.spm_source
# Multilingual target side: default to using first supported language code. # Multilingual target side: default to using first supported language code.
@@ -172,7 +190,7 @@ class MarianTokenizer(PreTrainedTokenizer):
def _tokenize(self, text: str) -> List[str]: def _tokenize(self, text: str) -> List[str]:
code, text = self.remove_language_code(text) code, text = self.remove_language_code(text)
pieces = self.current_spm.EncodeAsPieces(text) pieces = self.current_spm.encode(text, out_type=str)
return code + pieces return code + pieces
def _convert_id_to_token(self, index: int) -> str: def _convert_id_to_token(self, index: int) -> str:
@@ -283,7 +301,12 @@ class MarianTokenizer(PreTrainedTokenizer):
def __setstate__(self, d: Dict) -> None:
    """Restore the instance from state ``d`` and rebuild both SentencePiece models.

    Args:
        d: Mapping that becomes the instance ``__dict__``.
    """
    self.__dict__ = d

    # for backward compatibility: state that predates ``sp_model_kwargs``
    # lacks the attribute, so fall back to "no extra options".
    if not hasattr(self, "sp_model_kwargs"):
        self.sp_model_kwargs = {}

    # ``self.spm_files`` holds the source and target model paths, in that order.
    self.spm_source, self.spm_target = (load_spm(f, self.sp_model_kwargs) for f in self.spm_files)
    self.current_spm = self.spm_source
    self._setup_normalizer()
@@ -308,8 +331,8 @@ class MarianTokenizer(PreTrainedTokenizer):
return self._special_token_mask(token_ids_0 + token_ids_1) + [1] return self._special_token_mask(token_ids_0 + token_ids_1) + [1]
def load_spm(
    path: str, sp_model_kwargs: Optional[Dict[str, Any]] = None
) -> sentencepiece.SentencePieceProcessor:
    """Build a ``SentencePieceProcessor`` and load the model stored at ``path``.

    Args:
        path: Location of the SentencePiece model file; converted with ``str``
            before loading so path-like objects are accepted as well.
        sp_model_kwargs: Keyword arguments forwarded to the
            ``SentencePieceProcessor`` constructor. ``None`` (the default)
            means no extra arguments, which keeps the previous one-argument
            call ``load_spm(path)`` working.

    Returns:
        The loaded processor.
    """
    processor = sentencepiece.SentencePieceProcessor(**(sp_model_kwargs or {}))
    processor.Load(str(path))
    return processor

View File

@@ -16,7 +16,7 @@
import os import os
from contextlib import contextmanager from contextlib import contextmanager
from shutil import copyfile from shutil import copyfile
from typing import Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm import sentencepiece as spm
@@ -76,6 +76,20 @@ class MBart50Tokenizer(PreTrainedTokenizer):
mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
using the forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Examples:: Examples::
@@ -108,11 +122,14 @@ class MBart50Tokenizer(PreTrainedTokenizer):
unk_token="<unk>", unk_token="<unk>",
pad_token="<pad>", pad_token="<pad>",
mask_token="<mask>", mask_token="<mask>",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
# Mask token behave like a normal word, i.e. include the space before it # Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
src_lang=src_lang, src_lang=src_lang,
tgt_lang=tgt_lang, tgt_lang=tgt_lang,
@@ -122,10 +139,11 @@ class MBart50Tokenizer(PreTrainedTokenizer):
cls_token=cls_token, cls_token=cls_token,
pad_token=pad_token, pad_token=pad_token,
mask_token=mask_token, mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file)) self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file self.vocab_file = vocab_file
@@ -177,7 +195,12 @@ class MBart50Tokenizer(PreTrainedTokenizer):
def __setstate__(self, d: Dict) -> None:
    """Restore the instance from state ``d`` and reload the SentencePiece model.

    Args:
        d: Mapping that becomes the instance ``__dict__``.
    """
    self.__dict__ = d

    # for backward compatibility: state that predates ``sp_model_kwargs``
    # lacks the attribute, so fall back to "no extra options".
    if not hasattr(self, "sp_model_kwargs"):
        self.sp_model_kwargs = {}

    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(self.vocab_file)
def get_vocab(self) -> Dict: def get_vocab(self) -> Dict:
@@ -186,7 +209,7 @@ class MBart50Tokenizer(PreTrainedTokenizer):
return vocab return vocab
def _tokenize(self, text: str) -> List[str]: def _tokenize(self, text: str) -> List[str]:
return self.sp_model.EncodeAsPieces(text) return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token: str) -> int: def _convert_token_to_id(self, token: str) -> int:
"""Converts a token (str) in an id using the vocab.""" """Converts a token (str) in an id using the vocab."""

View File

@@ -14,7 +14,7 @@
# limitations under the License. # limitations under the License.
import os import os
from shutil import copyfile from shutil import copyfile
from typing import Dict, List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm import sentencepiece as spm
@@ -77,6 +77,20 @@ class PegasusTokenizer(PreTrainedTokenizer):
tokenizer tokenizer
<https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66>`__ <https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66>`__
that uses the tokens 2 - 104 only for pretraining that uses the tokens 2 - 104 only for pretraining
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
using the forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -95,10 +109,10 @@ class PegasusTokenizer(PreTrainedTokenizer):
mask_token_sent="<mask_1>", mask_token_sent="<mask_1>",
additional_special_tokens=None, additional_special_tokens=None,
offset=103, # entries 2 - 104 are only used for pretraining offset=103, # entries 2 - 104 are only used for pretraining
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
self.offset = offset self.offset = offset
if additional_special_tokens is not None: if additional_special_tokens is not None:
assert isinstance( assert isinstance(
additional_special_tokens, list additional_special_tokens, list
@@ -123,6 +137,8 @@ class PegasusTokenizer(PreTrainedTokenizer):
additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else [] additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else []
additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)] additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
eos_token=eos_token, eos_token=eos_token,
unk_token=unk_token, unk_token=unk_token,
@@ -131,11 +147,12 @@ class PegasusTokenizer(PreTrainedTokenizer):
mask_token_sent=mask_token_sent, mask_token_sent=mask_token_sent,
offset=offset, offset=offset,
additional_special_tokens=additional_special_tokens, additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
self.mask_token_sent = mask_token_sent self.mask_token_sent = mask_token_sent
self.vocab_file = vocab_file self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file) self.sp_model.Load(vocab_file)
# add special tokens to encoder dict # add special tokens to encoder dict
@@ -175,16 +192,17 @@ class PegasusTokenizer(PreTrainedTokenizer):
def __setstate__(self, d: Dict) -> None:
    """Restore the instance from state ``d`` and reload the SentencePiece model.

    Args:
        d: Mapping that becomes the instance ``__dict__``.
    """
    self.__dict__ = d

    # for backward compatibility: state that predates ``sp_model_kwargs``
    # lacks the attribute, so fall back to "no extra options".
    if not hasattr(self, "sp_model_kwargs"):
        self.sp_model_kwargs = {}

    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(self.vocab_file)
def _tokenize(self, text, sample=False): def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words""" """Take as input a string and return a list of strings (tokens) for words/sub-words"""
if not sample: return self.sp_model.encode(text, out_type=str)
pieces = self.sp_model.EncodeAsPieces(text)
else:
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
return pieces
def _convert_token_to_id(self, token: str) -> int: def _convert_token_to_id(self, token: str) -> int:
"""Converts a token (str) to an id using the vocab.""" """Converts a token (str) to an id using the vocab."""

View File

@@ -17,7 +17,7 @@
import os import os
from shutil import copyfile from shutil import copyfile
from typing import Dict, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm import sentencepiece as spm
@@ -68,6 +68,20 @@ class ReformerTokenizer(PreTrainedTokenizer):
The token used for padding, for example when batching sequences of different lengths. The token used for padding, for example when batching sequences of different lengths.
additional_special_tokens (:obj:`List[str]`, `optional`): additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
using the forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -75,16 +89,27 @@ class ReformerTokenizer(PreTrainedTokenizer):
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask"] model_input_names = ["input_ids", "attention_mask"]
def __init__(
    self,
    vocab_file,
    eos_token="</s>",
    unk_token="<unk>",
    additional_special_tokens=None,
    sp_model_kwargs: Optional[Dict[str, Any]] = None,
    **kwargs
) -> None:
    """Create the tokenizer and load its SentencePiece model from ``vocab_file``.

    Args:
        vocab_file: Path of the SentencePiece model file to load.
        eos_token: End-of-sequence token forwarded to the base class.
        unk_token: Unknown token forwarded to the base class.
        additional_special_tokens: Extra special tokens forwarded to the base
            class; ``None`` (the default) is treated as an empty list.
        sp_model_kwargs: Keyword arguments for the ``SentencePieceProcessor``
            constructor; ``None`` means no extra arguments.
        **kwargs: Remaining keyword arguments forwarded to the base class.
    """
    # Use a fresh list per call instead of a shared mutable default argument.
    if additional_special_tokens is None:
        additional_special_tokens = []

    # Set before ``super().__init__`` so the same dict is what gets forwarded.
    self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

    super().__init__(
        eos_token=eos_token,
        unk_token=unk_token,
        additional_special_tokens=additional_special_tokens,
        sp_model_kwargs=self.sp_model_kwargs,
        **kwargs,
    )

    self.vocab_file = vocab_file
    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(vocab_file)
@property @property
@@ -103,16 +128,17 @@ class ReformerTokenizer(PreTrainedTokenizer):
def __setstate__(self, d: Dict) -> None:
    """Restore the instance from state ``d`` and reload the SentencePiece model.

    Args:
        d: Mapping that becomes the instance ``__dict__``.
    """
    self.__dict__ = d

    # for backward compatibility: state that predates ``sp_model_kwargs``
    # lacks the attribute, so fall back to "no extra options".
    if not hasattr(self, "sp_model_kwargs"):
        self.sp_model_kwargs = {}

    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(self.vocab_file)
def _tokenize(self, text, sample=False): def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words""" """Take as input a string and return a list of strings (tokens) for words/sub-words"""
if not sample: return self.sp_model.encode(text, out_type=str)
pieces = self.sp_model.EncodeAsPieces(text)
else:
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
return pieces
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab.""" """Converts a token (str) in an id using the vocab."""

View File

@@ -17,7 +17,7 @@
import json import json
from pathlib import Path from pathlib import Path
from shutil import copyfile from shutil import copyfile
from typing import Dict, List, Optional, Tuple, Union from typing import Any, Dict, List, Optional, Tuple, Union
import sentencepiece import sentencepiece
@@ -79,6 +79,21 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
Whether or not to lowercase the input when tokenizing. Whether or not to lowercase the input when tokenizing.
tgt_lang (:obj:`str`, `optional`): tgt_lang (:obj:`str`, `optional`):
A string representing the target language. A string representing the target language.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
using the forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
**kwargs **kwargs
Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer` Additional keyword arguments passed along to :class:`~transformers.PreTrainedTokenizer`
""" """
@@ -102,8 +117,11 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
do_lower_case=False, do_lower_case=False,
tgt_lang=None, tgt_lang=None,
lang_codes=None, lang_codes=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs, **kwargs,
): ) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
bos_token=bos_token, bos_token=bos_token,
eos_token=eos_token, eos_token=eos_token,
@@ -113,6 +131,7 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
do_lower_case=do_lower_case, do_lower_case=do_lower_case,
tgt_lang=tgt_lang, tgt_lang=tgt_lang,
lang_codes=lang_codes, lang_codes=lang_codes,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
self.do_upper_case = do_upper_case self.do_upper_case = do_upper_case
@@ -121,7 +140,7 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
self.encoder = load_json(vocab_file) self.encoder = load_json(vocab_file)
self.decoder = {v: k for k, v in self.encoder.items()} self.decoder = {v: k for k, v in self.encoder.items()}
self.spm_file = spm_file self.spm_file = spm_file
self.sp_model = load_spm(spm_file) self.sp_model = load_spm(spm_file, self.sp_model_kwargs)
if lang_codes is not None: if lang_codes is not None:
self.lang_codes = lang_codes self.lang_codes = lang_codes
@@ -155,7 +174,7 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
self.prefix_tokens = [lang_code_id] self.prefix_tokens = [lang_code_id]
def _tokenize(self, text: str) -> List[str]: def _tokenize(self, text: str) -> List[str]:
return self.sp_model.EncodeAsPieces(text) return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):
return self.encoder.get(token, self.encoder[self.unk_token]) return self.encoder.get(token, self.encoder[self.unk_token])
@@ -221,7 +240,12 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
def __setstate__(self, d: Dict) -> None:
    """Restore the instance from state ``d`` and reload the SentencePiece model.

    Args:
        d: Mapping that becomes the instance ``__dict__``.
    """
    self.__dict__ = d

    # for backward compatibility: state that predates ``sp_model_kwargs``
    # lacks the attribute, so fall back to "no extra options".
    if not hasattr(self, "sp_model_kwargs"):
        self.sp_model_kwargs = {}

    self.sp_model = load_spm(self.spm_file, self.sp_model_kwargs)
def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]: def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
save_dir = Path(save_directory) save_dir = Path(save_directory)
@@ -241,8 +265,8 @@ class Speech2TextTokenizer(PreTrainedTokenizer):
return (str(vocab_save_path), str(spm_save_path)) return (str(vocab_save_path), str(spm_save_path))
def load_spm(
    path: str, sp_model_kwargs: Optional[Dict[str, Any]] = None
) -> sentencepiece.SentencePieceProcessor:
    """Build a ``SentencePieceProcessor`` and load the model stored at ``path``.

    Args:
        path: Location of the SentencePiece model file; converted with ``str``
            before loading.
        sp_model_kwargs: Keyword arguments forwarded to the
            ``SentencePieceProcessor`` constructor. ``None`` (the default)
            means no extra arguments, which keeps the previous one-argument
            call ``load_spm(path)`` working.

    Returns:
        The loaded processor.
    """
    processor = sentencepiece.SentencePieceProcessor(**(sp_model_kwargs or {}))
    processor.Load(str(path))
    return processor

View File

@@ -19,7 +19,7 @@ import os
import re import re
import warnings import warnings
from shutil import copyfile from shutil import copyfile
from typing import List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm import sentencepiece as spm
@@ -81,6 +81,20 @@ class T5Tokenizer(PreTrainedTokenizer):
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__). <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
additional_special_tokens (:obj:`List[str]`, `optional`): additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
using the forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes: Attributes:
sp_model (:obj:`SentencePieceProcessor`): sp_model (:obj:`SentencePieceProcessor`):
@@ -100,8 +114,9 @@ class T5Tokenizer(PreTrainedTokenizer):
pad_token="<pad>", pad_token="<pad>",
extra_ids=100, extra_ids=100,
additional_special_tokens=None, additional_special_tokens=None,
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
# Add extra_ids to the special token list # Add extra_ids to the special token list
if extra_ids > 0 and additional_special_tokens is None: if extra_ids > 0 and additional_special_tokens is None:
additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)] additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
@@ -114,19 +129,22 @@ class T5Tokenizer(PreTrainedTokenizer):
"In this case the additional_special_tokens must include the extra_ids tokens" "In this case the additional_special_tokens must include the extra_ids tokens"
) )
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
eos_token=eos_token, eos_token=eos_token,
unk_token=unk_token, unk_token=unk_token,
pad_token=pad_token, pad_token=pad_token,
extra_ids=extra_ids, extra_ids=extra_ids,
additional_special_tokens=additional_special_tokens, additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
self.vocab_file = vocab_file self.vocab_file = vocab_file
self._extra_ids = extra_ids self._extra_ids = extra_ids
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file) self.sp_model.Load(vocab_file)
@property @property
@@ -231,16 +249,17 @@ class T5Tokenizer(PreTrainedTokenizer):
def __setstate__(self, d: Dict) -> None:
    """Restore the instance from state ``d`` and reload the SentencePiece model.

    Args:
        d: Mapping that becomes the instance ``__dict__``.
    """
    self.__dict__ = d

    # for backward compatibility: state that predates ``sp_model_kwargs``
    # lacks the attribute, so fall back to "no extra options".
    if not hasattr(self, "sp_model_kwargs"):
        self.sp_model_kwargs = {}

    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
    self.sp_model.Load(self.vocab_file)
def _tokenize(self, text, sample=False): def _tokenize(self, text: str) -> List[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words""" """Take as input a string and return a list of strings (tokens) for words/sub-words"""
if not sample: return self.sp_model.encode(text, out_type=str)
pieces = self.sp_model.EncodeAsPieces(text)
else:
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
return pieces
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab.""" """Converts a token (str) in an id using the vocab."""

View File

@@ -16,7 +16,7 @@
import collections import collections
import os import os
from shutil import copyfile from shutil import copyfile
from typing import List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
from ...tokenization_utils import PreTrainedTokenizer from ...tokenization_utils import PreTrainedTokenizer
from ...utils import logging from ...utils import logging
@@ -96,6 +96,20 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assumes that nbest_size is infinite and samples from all hypotheses (lattice)
using the forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes: Attributes:
sp_model (:obj:`SentencePieceProcessor`): sp_model (:obj:`SentencePieceProcessor`):
@@ -117,8 +131,11 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
pad_token="[PAD]", pad_token="[PAD]",
cls_token="[CLS]", cls_token="[CLS]",
mask_token="[MASK]", mask_token="[MASK]",
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
bos_token=bos_token, bos_token=bos_token,
eos_token=eos_token, eos_token=eos_token,
@@ -127,6 +144,7 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
pad_token=pad_token, pad_token=pad_token,
cls_token=cls_token, cls_token=cls_token,
mask_token=mask_token, mask_token=mask_token,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
@@ -139,7 +157,7 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
) )
raise raise
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(str(vocab_file)) self.sp_model.Load(str(vocab_file))
self.vocab_file = vocab_file self.vocab_file = vocab_file
@@ -177,7 +195,12 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
"pip install sentencepiece" "pip install sentencepiece"
) )
raise raise
self.sp_model = spm.SentencePieceProcessor()
# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file) self.sp_model.Load(self.vocab_file)
def get_special_tokens_mask( def get_special_tokens_mask(
@@ -241,8 +264,8 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
vocab.update(self.added_tokens_encoder) vocab.update(self.added_tokens_encoder)
return vocab return vocab
def _tokenize(self, text): def _tokenize(self, text: str) -> str:
return self.sp_model.EncodeAsPieces(text) return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab.""" """Converts a token (str) in an id using the vocab."""

View File

@@ -17,7 +17,7 @@
import os import os
from shutil import copyfile from shutil import copyfile
from typing import List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm import sentencepiece as spm
@@ -94,7 +94,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
sp_model_kwargs (:obj:`dict`, `optional`, defaults to :obj:`None`): sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set: <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
@@ -129,9 +129,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
unk_token="<unk>", unk_token="<unk>",
pad_token="<pad>", pad_token="<pad>",
mask_token="<mask>", mask_token="<mask>",
sp_model_kwargs=None, sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
# Mask token behave like a normal word, i.e. include the space before it # Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
@@ -271,7 +271,7 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
vocab.update(self.added_tokens_encoder) vocab.update(self.added_tokens_encoder)
return vocab return vocab
def _tokenize(self, text): def _tokenize(self, text: str) -> List[str]:
return self.sp_model.encode(text, out_type=str) return self.sp_model.encode(text, out_type=str)
def _convert_token_to_id(self, token): def _convert_token_to_id(self, token):

View File

@@ -18,7 +18,7 @@
import os import os
import unicodedata import unicodedata
from shutil import copyfile from shutil import copyfile
from typing import List, Optional, Tuple from typing import Any, Dict, List, Optional, Tuple
import sentencepiece as spm import sentencepiece as spm
@@ -99,6 +99,20 @@ class XLNetTokenizer(PreTrainedTokenizer):
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<eop>", "<eod>"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
sp_model_kwargs (:obj:`dict`, `optional`):
Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
<https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
- ``enable_sampling``: Enable subword regularization.
- ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
- ``nbest_size = {0,1}``: No sampling is performed.
- ``nbest_size > 1``: samples from the nbest_size results.
- ``nbest_size < 0``: assuming that nbest_size is infinite and samples from all hypotheses (lattice)
using forward-filtering-and-backward-sampling algorithm.
- ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
BPE-dropout.
Attributes: Attributes:
sp_model (:obj:`SentencePieceProcessor`): sp_model (:obj:`SentencePieceProcessor`):
@@ -124,11 +138,14 @@ class XLNetTokenizer(PreTrainedTokenizer):
cls_token="<cls>", cls_token="<cls>",
mask_token="<mask>", mask_token="<mask>",
additional_special_tokens=["<eop>", "<eod>"], additional_special_tokens=["<eop>", "<eod>"],
sp_model_kwargs: Optional[Dict[str, Any]] = None,
**kwargs **kwargs
): ) -> None:
# Mask token behave like a normal word, i.e. include the space before it # Mask token behave like a normal word, i.e. include the space before it
mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
super().__init__( super().__init__(
do_lower_case=do_lower_case, do_lower_case=do_lower_case,
remove_space=remove_space, remove_space=remove_space,
@@ -141,6 +158,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
cls_token=cls_token, cls_token=cls_token,
mask_token=mask_token, mask_token=mask_token,
additional_special_tokens=additional_special_tokens, additional_special_tokens=additional_special_tokens,
sp_model_kwargs=self.sp_model_kwargs,
**kwargs, **kwargs,
) )
@@ -151,7 +169,7 @@ class XLNetTokenizer(PreTrainedTokenizer):
self.keep_accents = keep_accents self.keep_accents = keep_accents
self.vocab_file = vocab_file self.vocab_file = vocab_file
self.sp_model = spm.SentencePieceProcessor() self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(vocab_file) self.sp_model.Load(vocab_file)
@property @property
@@ -170,7 +188,12 @@ class XLNetTokenizer(PreTrainedTokenizer):
def __setstate__(self, d): def __setstate__(self, d):
self.__dict__ = d self.__dict__ = d
self.sp_model = spm.SentencePieceProcessor()
# for backward compatibility
if not hasattr(self, "sp_model_kwargs"):
self.sp_model_kwargs = {}
self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.vocab_file) self.sp_model.Load(self.vocab_file)
def preprocess_text(self, inputs): def preprocess_text(self, inputs):
@@ -188,14 +211,10 @@ class XLNetTokenizer(PreTrainedTokenizer):
return outputs return outputs
def _tokenize(self, text, sample=False): def _tokenize(self, text: str) -> List[str]:
"""Tokenize a string.""" """Tokenize a string."""
text = self.preprocess_text(text) text = self.preprocess_text(text)
pieces = self.sp_model.encode(text, out_type=str)
if not sample:
pieces = self.sp_model.EncodeAsPieces(text)
else:
pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
new_pieces = [] new_pieces = []
for piece in pieces: for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os import os
import unittest import unittest
@@ -33,6 +32,8 @@ class AlbertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = AlbertTokenizer tokenizer_class = AlbertTokenizer
rust_tokenizer_class = AlbertTokenizerFast rust_tokenizer_class = AlbertTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True
test_sentencepiece_ignore_case = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import unittest import unittest
from transformers import BarthezTokenizer, BarthezTokenizerFast, BatchEncoding from transformers import BarthezTokenizer, BarthezTokenizerFast, BatchEncoding
@@ -24,12 +23,13 @@ from .test_tokenization_common import TokenizerTesterMixin
@require_tokenizers @require_tokenizers
@require_sentencepiece @require_sentencepiece
@slow @slow # see https://github.com/huggingface/transformers/issues/11457
class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class BarthezTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BarthezTokenizer tokenizer_class = BarthezTokenizer
rust_tokenizer_class = BarthezTokenizerFast rust_tokenizer_class = BarthezTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os import os
import unittest import unittest
@@ -33,6 +32,7 @@ SAMPLE_VOCAB = os.path.join(os.path.dirname(os.path.abspath(__file__)), "fixture
class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase): class BertGenerationTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BertGenerationTokenizer tokenizer_class = BertGenerationTokenizer
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os import os
import unittest import unittest
@@ -36,11 +35,12 @@ class BigBirdTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = BigBirdTokenizer tokenizer_class = BigBirdTokenizer
rust_tokenizer_class = BigBirdTokenizerFast rust_tokenizer_class = BigBirdTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()
tokenizer = BigBirdTokenizer(SAMPLE_VOCAB, keep_accents=True) tokenizer = self.tokenizer_class(SAMPLE_VOCAB, keep_accents=True)
tokenizer.save_pretrained(self.tmpdirname) tokenizer.save_pretrained(self.tmpdirname)
def test_rust_and_python_full_tokenizers(self): def test_rust_and_python_full_tokenizers(self):

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os import os
import unittest import unittest
@@ -37,6 +36,7 @@ class CamembertTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = CamembertTokenizer tokenizer_class = CamembertTokenizer
rust_tokenizer_class = CamembertTokenizerFast rust_tokenizer_class = CamembertTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -15,6 +15,7 @@
import inspect import inspect
import itertools
import os import os
import pickle import pickle
import re import re
@@ -100,6 +101,13 @@ class TokenizerTesterMixin:
from_pretrained_vocab_key = "vocab_file" from_pretrained_vocab_key = "vocab_file"
test_seq2seq = True test_seq2seq = True
# set to True to test a sentencepiece tokenizer
test_sentencepiece = False
# set to True to ignore casing when testing a sentencepiece tokenizer
# test_sentencepiece must also be set to True
test_sentencepiece_ignore_case = False
def setUp(self) -> None: def setUp(self) -> None:
# Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the # Tokenizer.filter makes it possible to filter which Tokenizer to case based on all the
# information available in Tokenizer (name, rust class, python class, vocab key name) # information available in Tokenizer (name, rust class, python class, vocab key name)
@@ -216,6 +224,38 @@ class TokenizerTesterMixin:
for i in range(len(batch_encode_plus_sequences["input_ids"])) for i in range(len(batch_encode_plus_sequences["input_ids"]))
] ]
def test_subword_regularization_tokenizer(self) -> None:
    """
    Check that ``sp_model_kwargs`` are stored on the tokenizer and that they enable subword sampling.

    Returns without checking anything unless ``test_sentencepiece`` is set to True on the test class.
    """
    if not self.test_sentencepiece:
        return

    # Subword regularization is only available for the slow tokenizer.
    sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
    tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)

    self.assertTrue(hasattr(tokenizer, "sp_model_kwargs"))
    self.assertIsNotNone(tokenizer.sp_model_kwargs)
    # assertIsInstance reports the offending type on failure, unlike assertTrue(isinstance(...))
    self.assertIsInstance(tokenizer.sp_model_kwargs, dict)
    self.assertEqual(tokenizer.sp_model_kwargs, sp_model_kwargs)

    self.check_subword_sampling(tokenizer)
def test_pickle_subword_regularization_tokenizer(self) -> None:
    """
    Check that ``sp_model_kwargs`` and subword sampling survive a pickle round trip.

    Google pickle __getstate__ __setstate__ if you are struggling with this.

    Returns without checking anything unless ``test_sentencepiece`` is set to True on the test class.
    """
    if not self.test_sentencepiece:
        return

    # Subword regularization is only available for the slow tokenizer.
    sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
    tokenizer = self.get_tokenizer(sp_model_kwargs=sp_model_kwargs)

    tokenizer_bin = pickle.dumps(tokenizer)
    # drop the original so the checks below can only be satisfied by the unpickled copy
    del tokenizer
    tokenizer_new = pickle.loads(tokenizer_bin)

    self.assertTrue(hasattr(tokenizer_new, "sp_model_kwargs"))
    self.assertIsNotNone(tokenizer_new.sp_model_kwargs)
    # assertIsInstance reports the offending type on failure, unlike assertTrue(isinstance(...))
    self.assertIsInstance(tokenizer_new.sp_model_kwargs, dict)
    self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)

    self.check_subword_sampling(tokenizer_new)
def test_model_input_names_signature(self): def test_model_input_names_signature(self):
accepted_model_main_input_names = [ accepted_model_main_input_names = [
"input_ids", # nlp models "input_ids", # nlp models
@@ -1727,6 +1767,46 @@ class TokenizerTesterMixin:
# add pad_token_id to pass subsequent tests # add pad_token_id to pass subsequent tests
tokenizer.add_special_tokens({"pad_token": "<PAD>"}) tokenizer.add_special_tokens({"pad_token": "<PAD>"})
def check_subword_sampling(
    self,
    tokenizer: "PreTrainedTokenizer",
    text: str = None,
) -> None:
    """
    Check if the tokenizer generates different results when subword regularization is enabled.

    Subword regularization augments training data with subword sampling.
    This has a random component.

    Args:
        tokenizer: The tokenizer to check.
        text: The text to use for the checks. A default sentence is used when it is None. The text is
            lower-cased before tokenizing when ``test_sentencepiece_ignore_case`` is set.
    """
    text = "This is a test for subword regularization." if text is None else text
    if self.test_sentencepiece_ignore_case:
        text = text.lower()

    # tokenize the same text several times; with sampling enabled the results should vary
    tokens_list = [tokenizer.tokenize(text) for _ in range(5)]

    # check if sampling is done: at least one pair of tokenizations must differ
    subword_sampling_found = any(
        first != second for first, second in itertools.combinations(tokens_list, 2)
    )
    self.assertTrue(subword_sampling_found, "all sampled tokenizations are identical")

    # check if converting back to original text works
    for tokens in tokens_list:
        decoded = tokenizer.convert_tokens_to_string(tokens)
        if self.test_sentencepiece_ignore_case:
            decoded = decoded.lower()
        self.assertEqual(text, decoded)
@require_torch @require_torch
@slow @slow
def test_torch_encode_plus_sent_to_model(self): def test_torch_encode_plus_sent_to_model(self):

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os import os
import unittest import unittest
@@ -33,6 +32,8 @@ class DebertaV2TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = DebertaV2Tokenizer tokenizer_class = DebertaV2Tokenizer
rust_tokenizer_class = None rust_tokenizer_class = None
test_rust_tokenizer = False test_rust_tokenizer = False
test_sentencepiece = True
test_sentencepiece_ignore_case = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -45,6 +45,7 @@ class M2M100TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = M2M100Tokenizer tokenizer_class = M2M100Tokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
test_seq2seq = False test_seq2seq = False
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os import os
import tempfile import tempfile
import unittest import unittest
@@ -50,6 +49,7 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MarianTokenizer tokenizer_class = MarianTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -38,6 +38,7 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MBart50Tokenizer tokenizer_class = MBart50Tokenizer
rust_tokenizer_class = MBart50TokenizerFast rust_tokenizer_class = MBart50TokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -31,6 +31,7 @@ class PegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = PegasusTokenizer tokenizer_class = PegasusTokenizer
rust_tokenizer_class = PegasusTokenizerFast rust_tokenizer_class = PegasusTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -104,6 +105,7 @@ class BigBirdPegasusTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = PegasusTokenizer tokenizer_class = PegasusTokenizer
rust_tokenizer_class = PegasusTokenizerFast rust_tokenizer_class = PegasusTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os import os
import unittest import unittest
@@ -34,6 +33,7 @@ class ReformerTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
rust_tokenizer_class = ReformerTokenizerFast rust_tokenizer_class = ReformerTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
test_seq2seq = False test_seq2seq = False
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -40,6 +40,7 @@ ES_CODE = 10
class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase): class SpeechToTextTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = Speech2TextTokenizer tokenizer_class = Speech2TextTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import unittest import unittest
from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast from transformers import SPIECE_UNDERLINE, AddedToken, BatchEncoding, T5Tokenizer, T5TokenizerFast
@@ -40,6 +39,7 @@ class T5TokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = T5Tokenizer tokenizer_class = T5Tokenizer
rust_tokenizer_class = T5TokenizerFast rust_tokenizer_class = T5TokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os import os
import unittest import unittest
@@ -32,6 +31,7 @@ class XLMProphetNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLMProphetNetTokenizer tokenizer_class = XLMProphetNetTokenizer
test_rust_tokenizer = False test_rust_tokenizer = False
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()

View File

@@ -13,10 +13,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import itertools
import os import os
import pickle
import unittest import unittest
from transformers import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast from transformers import SPIECE_UNDERLINE, XLMRobertaTokenizer, XLMRobertaTokenizerFast
@@ -36,6 +33,7 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLMRobertaTokenizer tokenizer_class = XLMRobertaTokenizer
rust_tokenizer_class = XLMRobertaTokenizerFast rust_tokenizer_class = XLMRobertaTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()
@@ -120,41 +118,6 @@ class XLMRobertaTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
], ],
) )
def test_subword_regularization_tokenizer(self):
# Subword regularization is only available for the slow tokenizer.
tokenizer = XLMRobertaTokenizer(
SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs={"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
)
# Subword regularization augments training data with subword sampling.
# This has a random component. We test if the tokenizer generates different
# results when subword regularization is enabled.
tokens_list = []
for _ in range(5):
tokens_list.append(tokenizer.tokenize("This is a test for subword regularization."))
# the list of different pairs of tokens_list
combinations = itertools.combinations(tokens_list, 2)
all_equal = True
for combination in combinations:
if combination[0] != combination[1]:
all_equal = False
self.assertFalse(all_equal)
def test_pickle_subword_regularization_tokenizer(self):
"""Google pickle __getstate__ __setstate__ if you are struggling with this."""
# Subword regularization is only available for the slow tokenizer.
sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
tokenizer = XLMRobertaTokenizer(SAMPLE_VOCAB, keep_accents=True, sp_model_kwargs=sp_model_kwargs)
tokenizer_bin = pickle.dumps(tokenizer)
tokenizer_new = pickle.loads(tokenizer_bin)
self.assertIsNotNone(tokenizer_new.sp_model_kwargs)
self.assertTrue(isinstance(tokenizer_new.sp_model_kwargs, dict))
self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)
@cached_property @cached_property
def big_tokenizer(self): def big_tokenizer(self):
return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base") return XLMRobertaTokenizer.from_pretrained("xlm-roberta-base")

View File

@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os import os
import unittest import unittest
@@ -33,6 +32,7 @@ class XLNetTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = XLNetTokenizer tokenizer_class = XLNetTokenizer
rust_tokenizer_class = XLNetTokenizerFast rust_tokenizer_class = XLNetTokenizerFast
test_rust_tokenizer = True test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self): def setUp(self):
super().setUp() super().setUp()