Add from_slow in fast tokenizers build and fixes some bugs (#9987)

This commit is contained in:
Sylvain Gugger
2021-02-04 03:34:23 -05:00
committed by GitHub
parent 6244727e05
commit 7898fc03b1
16 changed files with 50 additions and 38 deletions

View File

@@ -103,8 +103,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every Attributes:
conversion (string, tokens and IDs). sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -88,8 +88,11 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
Whether or not to keep accents when tokenizing. Whether or not to keep accents when tokenizing.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token. The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
.. note:: When building a sequence using special tokens, this is not the token that is used for the
beginning of sequence. The token used is the :obj:`cls_token`. .. note::
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
that is used for the end of sequence. The token used is the :obj:`sep_token`. that is used for the end of sequence. The token used is the :obj:`sep_token`.
@@ -107,9 +110,7 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
instead of per-token classification). It is the first token of the sequence when built with special tokens. instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. Attributes: modeling. This is the token which the model will try to predict.
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -252,6 +252,7 @@ NO_CONFIG_TOKENIZER = [
HerbertTokenizerFast, HerbertTokenizerFast,
PhobertTokenizer, PhobertTokenizer,
BarthezTokenizer, BarthezTokenizer,
BarthezTokenizerFast,
] ]

View File

@@ -38,11 +38,9 @@ class BartTokenizer(RobertaTokenizer):
r""" r"""
Construct a BART tokenizer. Construct a BART tokenizer.
:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass
:meth:`~transformers.BartTokenizer.prepare_seq2seq_batch` :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
parameters and other methods.
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the
initialization parameters and other methods.
""" """
# merges and vocab same as Roberta # merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models} max_model_input_sizes = {m: 1024 for m in _all_bart_models}

View File

@@ -37,6 +37,13 @@ _all_bart_models = [
class BartTokenizerFast(RobertaTokenizerFast): class BartTokenizerFast(RobertaTokenizerFast):
r"""
Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer to
superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
initialization parameters and other methods.
"""
# merges and vocab same as Roberta # merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models} max_model_input_sizes = {m: 1024 for m in _all_bart_models}
pretrained_vocab_files_map = { pretrained_vocab_files_map = {

View File

@@ -90,8 +90,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every Attributes:
conversion (string, tokens and IDs). sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -98,9 +98,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -36,7 +36,7 @@ class BlenderbotTokenizer(RobertaTokenizer):
Construct a Blenderbot tokenizer. Construct a Blenderbot tokenizer.
:class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs :class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs
end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesnt add BOS token end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
to the beginning of sequences. to the beginning of sequences.
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning

View File

@@ -93,8 +93,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every Attributes:
conversion (string, tokens and IDs). sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -101,10 +101,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -92,6 +92,10 @@ class T5Tokenizer(PreTrainedTokenizer):
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__). <https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
additional_special_tokens (:obj:`List[str]`, `optional`): additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -97,8 +97,9 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every Attributes:
conversion (string, tokens and IDs). sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -95,8 +95,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every Attributes:
conversion (string, tokens and IDs). sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -106,9 +106,6 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
modeling. This is the token which the model will try to predict. modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer. Additional special tokens used by the tokenizer.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
conversion (string, tokens and IDs).
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -1793,12 +1793,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def _from_pretrained( def _from_pretrained(
cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
): ):
# We instantiate fast tokenizers based on a slow tokenizer for now # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
# In the future we can also use a direct way based on saving/instantiating # file or if `from_slow` is set to True.
# tokenizer's Tokenizer directly from it's serialization JSON from_slow = kwargs.get("from_slow", False)
if ( has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
"tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
) and cls.slow_tokenizer_class is not None:
slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
copy.deepcopy(resolved_vocab_files), copy.deepcopy(resolved_vocab_files),
pretrained_model_name_or_path, pretrained_model_name_or_path,

View File

@@ -80,8 +80,15 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
slow_tokenizer = kwargs.pop("__slow_tokenizer", None) slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
fast_tokenizer_file = kwargs.pop("tokenizer_file", None) fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
from_slow = kwargs.pop("from_slow", False)
if fast_tokenizer_file is not None: if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
raise ValueError(
"Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
"have sentencepiece installed."
)
if fast_tokenizer_file is not None and not from_slow:
# We have a serialization from tokenizers which lets us directly build the backend # We have a serialization from tokenizers which lets us directly build the backend
fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file) fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
elif slow_tokenizer is not None: elif slow_tokenizer is not None: