Add from_slow in fast tokenizers build and fixes some bugs (#9987)

This commit is contained in:
Sylvain Gugger
2021-02-04 03:34:23 -05:00
committed by GitHub
parent 6244727e05
commit 7898fc03b1
16 changed files with 50 additions and 38 deletions

View File

@@ -103,8 +103,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
conversion (string, tokens and IDs).
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -88,8 +88,11 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
Whether or not to keep accents when tokenizing.
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
.. note:: When building a sequence using special tokens, this is not the token that is used for the
beginning of sequence. The token used is the :obj:`cls_token`.
.. note::
When building a sequence using special tokens, this is not the token that is used for the beginning of
sequence. The token used is the :obj:`cls_token`.
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
that is used for the end of sequence. The token used is the :obj:`sep_token`.
@@ -107,9 +110,7 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
instead of per-token classification). It is the first token of the sequence when built with special tokens.
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
The token used for masking values. This is the token used when training this model with masked language
modeling. This is the token which the model will try to predict. Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
modeling. This is the token which the model will try to predict.
"""
vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -252,6 +252,7 @@ NO_CONFIG_TOKENIZER = [
HerbertTokenizerFast,
PhobertTokenizer,
BarthezTokenizer,
BarthezTokenizerFast,
]

View File

@@ -38,11 +38,9 @@ class BartTokenizer(RobertaTokenizer):
r"""
Construct a BART tokenizer.
:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new
:meth:`~transformers.BartTokenizer.prepare_seq2seq_batch`
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the
initialization parameters and other methods.
:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass
:class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
parameters and other methods.
"""
# merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models}

View File

@@ -37,6 +37,13 @@ _all_bart_models = [
class BartTokenizerFast(RobertaTokenizerFast):
r"""
Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library).
:class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer to
superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
initialization parameters and other methods.
"""
# merges and vocab same as Roberta
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
pretrained_vocab_files_map = {

View File

@@ -90,8 +90,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
conversion (string, tokens and IDs).
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -98,9 +98,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -36,7 +36,7 @@ class BlenderbotTokenizer(RobertaTokenizer):
Construct a Blenderbot tokenizer.
:class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs
end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesnt add BOS token
end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
to the beginning of sequences.
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning

View File

@@ -93,8 +93,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
conversion (string, tokens and IDs).
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -101,10 +101,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -92,6 +92,10 @@ class T5Tokenizer(PreTrainedTokenizer):
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
additional_special_tokens (:obj:`List[str]`, `optional`):
Additional special tokens used by the tokenizer.
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -97,8 +97,9 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
conversion (string, tokens and IDs).
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -95,8 +95,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
conversion (string, tokens and IDs).
Attributes:
sp_model (:obj:`SentencePieceProcessor`):
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -106,9 +106,6 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
modeling. This is the token which the model will try to predict.
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
Additional special tokens used by the tokenizer.
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
conversion (string, tokens and IDs).
"""
vocab_files_names = VOCAB_FILES_NAMES

View File

@@ -1793,12 +1793,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
def _from_pretrained(
cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
):
# We instantiate fast tokenizers based on a slow tokenizer for now
# In the future we can also use a direct way based on saving/instantiating
# tokenizer's Tokenizer directly from it's serialization JSON
if (
"tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None
) and cls.slow_tokenizer_class is not None:
# We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
# file or if `from_slow` is set to True.
from_slow = kwargs.get("from_slow", False)
has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
copy.deepcopy(resolved_vocab_files),
pretrained_model_name_or_path,

View File

@@ -80,8 +80,15 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
def __init__(self, *args, **kwargs):
slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
from_slow = kwargs.pop("from_slow", False)
if fast_tokenizer_file is not None:
if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
raise ValueError(
"Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
"have sentencepiece installed."
)
if fast_tokenizer_file is not None and not from_slow:
# We have a serialization from tokenizers which let us directly build the backend
fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
elif slow_tokenizer is not None: