Add from_slow in fast tokenizers build and fixes some bugs (#9987)
This commit is contained in:
@@ -103,8 +103,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
||||
The token used for masking values. This is the token used when training this model with masked language
|
||||
modeling. This is the token which the model will try to predict.
|
||||
|
||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
||||
conversion (string, tokens and IDs).
|
||||
Attributes:
|
||||
sp_model (:obj:`SentencePieceProcessor`):
|
||||
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
@@ -88,8 +88,11 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
|
||||
Whether or not to keep accents when tokenizing.
|
||||
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
|
||||
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
|
||||
.. note:: When building a sequence using special tokens, this is not the token that is used for the
|
||||
beginning of sequence. The token used is the :obj:`cls_token`.
|
||||
|
||||
.. note::
|
||||
|
||||
When building a sequence using special tokens, this is not the token that is used for the beginning of
|
||||
sequence. The token used is the :obj:`cls_token`.
|
||||
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
|
||||
The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
|
||||
that is used for the end of sequence. The token used is the :obj:`sep_token`.
|
||||
@@ -107,9 +110,7 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
|
||||
instead of per-token classification). It is the first token of the sequence when built with special tokens.
|
||||
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
|
||||
The token used for masking values. This is the token used when training this model with masked language
|
||||
modeling. This is the token which the model will try to predict. Attributes:
|
||||
sp_model (:obj:`SentencePieceProcessor`):
|
||||
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||
modeling. This is the token which the model will try to predict.
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
@@ -252,6 +252,7 @@ NO_CONFIG_TOKENIZER = [
|
||||
HerbertTokenizerFast,
|
||||
PhobertTokenizer,
|
||||
BarthezTokenizer,
|
||||
BarthezTokenizerFast,
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -38,11 +38,9 @@ class BartTokenizer(RobertaTokenizer):
|
||||
r"""
|
||||
Construct a BART tokenizer.
|
||||
|
||||
:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new
|
||||
:meth:`~transformers.BartTokenizer.prepare_seq2seq_batch`
|
||||
|
||||
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the
|
||||
initialization parameters and other methods.
|
||||
:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass
|
||||
:class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
|
||||
parameters and other methods.
|
||||
"""
|
||||
# merges and vocab same as Roberta
|
||||
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
|
||||
|
||||
@@ -37,6 +37,13 @@ _all_bart_models = [
|
||||
|
||||
|
||||
class BartTokenizerFast(RobertaTokenizerFast):
|
||||
r"""
|
||||
Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library).
|
||||
|
||||
:class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer to
|
||||
superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
|
||||
initialization parameters and other methods.
|
||||
"""
|
||||
# merges and vocab same as Roberta
|
||||
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
|
||||
pretrained_vocab_files_map = {
|
||||
|
||||
@@ -90,8 +90,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
|
||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||
Additional special tokens used by the tokenizer.
|
||||
|
||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
||||
conversion (string, tokens and IDs).
|
||||
Attributes:
|
||||
sp_model (:obj:`SentencePieceProcessor`):
|
||||
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
@@ -98,9 +98,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
|
||||
modeling. This is the token which the model will try to predict.
|
||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||
Additional special tokens used by the tokenizer.
|
||||
|
||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
||||
conversion (string, tokens and IDs).
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
@@ -36,7 +36,7 @@ class BlenderbotTokenizer(RobertaTokenizer):
|
||||
Construct a Blenderbot tokenizer.
|
||||
|
||||
:class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs
|
||||
end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesnt add BOS token
|
||||
end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
|
||||
to the beginning of sequences.
|
||||
|
||||
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
|
||||
|
||||
@@ -93,8 +93,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||
Additional special tokens used by the tokenizer.
|
||||
|
||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
||||
conversion (string, tokens and IDs).
|
||||
Attributes:
|
||||
sp_model (:obj:`SentencePieceProcessor`):
|
||||
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
@@ -101,10 +101,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
|
||||
modeling. This is the token which the model will try to predict.
|
||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||
Additional special tokens used by the tokenizer.
|
||||
|
||||
Attributes:
|
||||
sp_model (:obj:`SentencePieceProcessor`):
|
||||
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
@@ -92,6 +92,10 @@ class T5Tokenizer(PreTrainedTokenizer):
|
||||
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
|
||||
additional_special_tokens (:obj:`List[str]`, `optional`):
|
||||
Additional special tokens used by the tokenizer.
|
||||
|
||||
Attributes:
|
||||
sp_model (:obj:`SentencePieceProcessor`):
|
||||
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
@@ -97,8 +97,9 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
|
||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||
Additional special tokens used by the tokenizer.
|
||||
|
||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
||||
conversion (string, tokens and IDs).
|
||||
Attributes:
|
||||
sp_model (:obj:`SentencePieceProcessor`):
|
||||
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
@@ -95,8 +95,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
|
||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||
Additional special tokens used by the tokenizer.
|
||||
|
||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
||||
conversion (string, tokens and IDs).
|
||||
Attributes:
|
||||
sp_model (:obj:`SentencePieceProcessor`):
|
||||
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
@@ -106,9 +106,6 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
|
||||
modeling. This is the token which the model will try to predict.
|
||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||
Additional special tokens used by the tokenizer.
|
||||
|
||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
||||
conversion (string, tokens and IDs).
|
||||
"""
|
||||
|
||||
vocab_files_names = VOCAB_FILES_NAMES
|
||||
|
||||
@@ -1793,12 +1793,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
||||
def _from_pretrained(
|
||||
cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
|
||||
):
|
||||
# We instantiate fast tokenizers based on a slow tokenizer for now
|
||||
# In the future we can also use a direct way based on saving/instantiating
|
||||
# tokenizer's Tokenizer directly from it's serialization JSON
|
||||
if (
|
||||
"tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None
|
||||
) and cls.slow_tokenizer_class is not None:
|
||||
# We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
|
||||
# file or if `from_slow` is set to True.
|
||||
from_slow = kwargs.get("from_slow", False)
|
||||
has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
|
||||
if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
|
||||
slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
|
||||
copy.deepcopy(resolved_vocab_files),
|
||||
pretrained_model_name_or_path,
|
||||
|
||||
@@ -80,8 +80,15 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
||||
def __init__(self, *args, **kwargs):
|
||||
slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
|
||||
fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
|
||||
from_slow = kwargs.pop("from_slow", False)
|
||||
|
||||
if fast_tokenizer_file is not None:
|
||||
if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
|
||||
raise ValueError(
|
||||
"Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
|
||||
"have sentencepiece installed."
|
||||
)
|
||||
|
||||
if fast_tokenizer_file is not None and not from_slow:
|
||||
# We have a serialization from tokenizers which lets us directly build the backend
|
||||
fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
|
||||
elif slow_tokenizer is not None:
|
||||
|
||||
Reference in New Issue
Block a user