Add from_slow in fast tokenizers build and fix some bugs (#9987)
This commit is contained in:
@@ -103,8 +103,9 @@ class AlbertTokenizer(PreTrainedTokenizer):
|
|||||||
The token used for masking values. This is the token used when training this model with masked language
|
The token used for masking values. This is the token used when training this model with masked language
|
||||||
modeling. This is the token which the model will try to predict.
|
modeling. This is the token which the model will try to predict.
|
||||||
|
|
||||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
Attributes:
|
||||||
conversion (string, tokens and IDs).
|
sp_model (:obj:`SentencePieceProcessor`):
|
||||||
|
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -88,8 +88,11 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
Whether or not to keep accents when tokenizing.
|
Whether or not to keep accents when tokenizing.
|
||||||
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
|
bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`):
|
||||||
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
|
The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
|
||||||
.. note:: When building a sequence using special tokens, this is not the token that is used for the
|
|
||||||
beginning of sequence. The token used is the :obj:`cls_token`.
|
.. note::
|
||||||
|
|
||||||
|
When building a sequence using special tokens, this is not the token that is used for the beginning of
|
||||||
|
sequence. The token used is the :obj:`cls_token`.
|
||||||
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
|
eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`):
|
||||||
The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
|
The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
|
||||||
that is used for the end of sequence. The token used is the :obj:`sep_token`.
|
that is used for the end of sequence. The token used is the :obj:`sep_token`.
|
||||||
@@ -107,9 +110,7 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
instead of per-token classification). It is the first token of the sequence when built with special tokens.
|
instead of per-token classification). It is the first token of the sequence when built with special tokens.
|
||||||
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
|
mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
|
||||||
The token used for masking values. This is the token used when training this model with masked language
|
The token used for masking values. This is the token used when training this model with masked language
|
||||||
modeling. This is the token which the model will try to predict. Attributes:
|
modeling. This is the token which the model will try to predict.
|
||||||
sp_model (:obj:`SentencePieceProcessor`):
|
|
||||||
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -252,6 +252,7 @@ NO_CONFIG_TOKENIZER = [
|
|||||||
HerbertTokenizerFast,
|
HerbertTokenizerFast,
|
||||||
PhobertTokenizer,
|
PhobertTokenizer,
|
||||||
BarthezTokenizer,
|
BarthezTokenizer,
|
||||||
|
BarthezTokenizerFast,
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -38,11 +38,9 @@ class BartTokenizer(RobertaTokenizer):
|
|||||||
r"""
|
r"""
|
||||||
Construct a BART tokenizer.
|
Construct a BART tokenizer.
|
||||||
|
|
||||||
:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new
|
:class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass
|
||||||
:meth:`~transformers.BartTokenizer.prepare_seq2seq_batch`
|
:class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization
|
||||||
|
parameters and other methods.
|
||||||
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the
|
|
||||||
initialization parameters and other methods.
|
|
||||||
"""
|
"""
|
||||||
# merges and vocab same as Roberta
|
# merges and vocab same as Roberta
|
||||||
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
|
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
|
||||||
|
|||||||
@@ -37,6 +37,13 @@ _all_bart_models = [
|
|||||||
|
|
||||||
|
|
||||||
class BartTokenizerFast(RobertaTokenizerFast):
|
class BartTokenizerFast(RobertaTokenizerFast):
|
||||||
|
r"""
|
||||||
|
Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library).
|
||||||
|
|
||||||
|
:class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer to
|
||||||
|
superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the
|
||||||
|
initialization parameters and other methods.
|
||||||
|
"""
|
||||||
# merges and vocab same as Roberta
|
# merges and vocab same as Roberta
|
||||||
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
|
max_model_input_sizes = {m: 1024 for m in _all_bart_models}
|
||||||
pretrained_vocab_files_map = {
|
pretrained_vocab_files_map = {
|
||||||
|
|||||||
@@ -90,8 +90,9 @@ class BarthezTokenizer(PreTrainedTokenizer):
|
|||||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||||
Additional special tokens used by the tokenizer.
|
Additional special tokens used by the tokenizer.
|
||||||
|
|
||||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
Attributes:
|
||||||
conversion (string, tokens and IDs).
|
sp_model (:obj:`SentencePieceProcessor`):
|
||||||
|
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -98,9 +98,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
modeling. This is the token which the model will try to predict.
|
modeling. This is the token which the model will try to predict.
|
||||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||||
Additional special tokens used by the tokenizer.
|
Additional special tokens used by the tokenizer.
|
||||||
|
|
||||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
|
||||||
conversion (string, tokens and IDs).
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ class BlenderbotTokenizer(RobertaTokenizer):
|
|||||||
Construct a Blenderbot tokenizer.
|
Construct a Blenderbot tokenizer.
|
||||||
|
|
||||||
:class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs
|
:class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs
|
||||||
end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesnt add BOS token
|
end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token
|
||||||
to the beginning of sequences.
|
to the beginning of sequences.
|
||||||
|
|
||||||
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
|
Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning
|
||||||
|
|||||||
@@ -93,8 +93,9 @@ class CamembertTokenizer(PreTrainedTokenizer):
|
|||||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||||
Additional special tokens used by the tokenizer.
|
Additional special tokens used by the tokenizer.
|
||||||
|
|
||||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
Attributes:
|
||||||
conversion (string, tokens and IDs).
|
sp_model (:obj:`SentencePieceProcessor`):
|
||||||
|
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -101,10 +101,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
modeling. This is the token which the model will try to predict.
|
modeling. This is the token which the model will try to predict.
|
||||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||||
Additional special tokens used by the tokenizer.
|
Additional special tokens used by the tokenizer.
|
||||||
|
|
||||||
Attributes:
|
|
||||||
sp_model (:obj:`SentencePieceProcessor`):
|
|
||||||
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -92,6 +92,10 @@ class T5Tokenizer(PreTrainedTokenizer):
|
|||||||
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
|
<https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117>`__).
|
||||||
additional_special_tokens (:obj:`List[str]`, `optional`):
|
additional_special_tokens (:obj:`List[str]`, `optional`):
|
||||||
Additional special tokens used by the tokenizer.
|
Additional special tokens used by the tokenizer.
|
||||||
|
|
||||||
|
Attributes:
|
||||||
|
sp_model (:obj:`SentencePieceProcessor`):
|
||||||
|
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -97,8 +97,9 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer):
|
|||||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||||
Additional special tokens used by the tokenizer.
|
Additional special tokens used by the tokenizer.
|
||||||
|
|
||||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
Attributes:
|
||||||
conversion (string, tokens and IDs).
|
sp_model (:obj:`SentencePieceProcessor`):
|
||||||
|
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -95,8 +95,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer):
|
|||||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||||
Additional special tokens used by the tokenizer.
|
Additional special tokens used by the tokenizer.
|
||||||
|
|
||||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
Attributes:
|
||||||
conversion (string, tokens and IDs).
|
sp_model (:obj:`SentencePieceProcessor`):
|
||||||
|
The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -106,9 +106,6 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
modeling. This is the token which the model will try to predict.
|
modeling. This is the token which the model will try to predict.
|
||||||
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
|
||||||
Additional special tokens used by the tokenizer.
|
Additional special tokens used by the tokenizer.
|
||||||
|
|
||||||
Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every
|
|
||||||
conversion (string, tokens and IDs).
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vocab_files_names = VOCAB_FILES_NAMES
|
vocab_files_names = VOCAB_FILES_NAMES
|
||||||
|
|||||||
@@ -1793,12 +1793,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin):
|
|||||||
def _from_pretrained(
|
def _from_pretrained(
|
||||||
cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
|
cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs
|
||||||
):
|
):
|
||||||
# We instantiate fast tokenizers based on a slow tokenizer for now
|
# We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
|
||||||
# In the future we can also use a direct way based on saving/instantiating
|
# file or if `from_slow` is set to True.
|
||||||
# tokenizer's Tokenizer directly from it's serialization JSON
|
from_slow = kwargs.get("from_slow", False)
|
||||||
if (
|
has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
|
||||||
"tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None
|
if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None:
|
||||||
) and cls.slow_tokenizer_class is not None:
|
|
||||||
slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
|
slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
|
||||||
copy.deepcopy(resolved_vocab_files),
|
copy.deepcopy(resolved_vocab_files),
|
||||||
pretrained_model_name_or_path,
|
pretrained_model_name_or_path,
|
||||||
|
|||||||
@@ -80,8 +80,15 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):
|
|||||||
def __init__(self, *args, **kwargs):
|
def __init__(self, *args, **kwargs):
|
||||||
slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
|
slow_tokenizer = kwargs.pop("__slow_tokenizer", None)
|
||||||
fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
|
fast_tokenizer_file = kwargs.pop("tokenizer_file", None)
|
||||||
|
from_slow = kwargs.pop("from_slow", False)
|
||||||
|
|
||||||
if fast_tokenizer_file is not None:
|
if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None:
|
||||||
|
raise ValueError(
|
||||||
|
"Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you "
|
||||||
|
"have sentencepiece installed."
|
||||||
|
)
|
||||||
|
|
||||||
|
if fast_tokenizer_file is not None and not from_slow:
|
||||||
# We have a serialization from tokenizers which lets us directly build the backend
|
# We have a serialization from tokenizers which lets us directly build the backend
|
||||||
fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
|
fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file)
|
||||||
elif slow_tokenizer is not None:
|
elif slow_tokenizer is not None:
|
||||||
|
|||||||
Reference in New Issue
Block a user