From 7898fc03b1dc837af8ff6e671aa72372b8802a86 Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 4 Feb 2021 03:34:23 -0500 Subject: [PATCH] Add `from_slow` in fast tokenizers build and fixes some bugs (#9987) --- src/transformers/models/albert/tokenization_albert.py | 5 +++-- .../models/albert/tokenization_albert_fast.py | 11 ++++++----- src/transformers/models/auto/tokenization_auto.py | 1 + src/transformers/models/bart/tokenization_bart.py | 8 +++----- .../models/bart/tokenization_bart_fast.py | 7 +++++++ .../models/barthez/tokenization_barthez.py | 5 +++-- .../models/barthez/tokenization_barthez_fast.py | 3 --- .../models/blenderbot/tokenization_blenderbot.py | 2 +- .../models/camembert/tokenization_camembert.py | 5 +++-- .../models/camembert/tokenization_camembert_fast.py | 4 ---- src/transformers/models/t5/tokenization_t5.py | 4 ++++ .../xlm_prophetnet/tokenization_xlm_prophetnet.py | 5 +++-- .../models/xlm_roberta/tokenization_xlm_roberta.py | 5 +++-- .../xlm_roberta/tokenization_xlm_roberta_fast.py | 3 --- src/transformers/tokenization_utils_base.py | 11 +++++------ src/transformers/tokenization_utils_fast.py | 9 ++++++++- 16 files changed, 50 insertions(+), 38 deletions(-) diff --git a/src/transformers/models/albert/tokenization_albert.py b/src/transformers/models/albert/tokenization_albert.py index a9bb75e95f..890c7f8707 100644 --- a/src/transformers/models/albert/tokenization_albert.py +++ b/src/transformers/models/albert/tokenization_albert.py @@ -103,8 +103,9 @@ class AlbertTokenizer(PreTrainedTokenizer): The token used for masking values. This is the token used when training this model with masked language modeling. This is the token which the model will try to predict. - Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every - conversion (string, tokens and IDs). + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/albert/tokenization_albert_fast.py b/src/transformers/models/albert/tokenization_albert_fast.py index abc305e4f1..5cfa584386 100644 --- a/src/transformers/models/albert/tokenization_albert_fast.py +++ b/src/transformers/models/albert/tokenization_albert_fast.py @@ -88,8 +88,11 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast): Whether or not to keep accents when tokenizing. bos_token (:obj:`str`, `optional`, defaults to :obj:`"[CLS]"`): The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token. - .. note:: When building a sequence using special tokens, this is not the token that is used for the - beginning of sequence. The token used is the :obj:`cls_token`. + + .. note:: + + When building a sequence using special tokens, this is not the token that is used for the beginning of + sequence. The token used is the :obj:`cls_token`. eos_token (:obj:`str`, `optional`, defaults to :obj:`"[SEP]"`): The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token that is used for the end of sequence. The token used is the :obj:`sep_token`. @@ -107,9 +110,7 @@ class AlbertTokenizerFast(PreTrainedTokenizerFast): instead of per-token classification). It is the first token of the sequence when built with special tokens. mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`): The token used for masking values. This is the token used when training this model with masked language - modeling. This is the token which the model will try to predict. Attributes: - sp_model (:obj:`SentencePieceProcessor`): - The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). + modeling. This is the token which the model will try to predict. """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/auto/tokenization_auto.py b/src/transformers/models/auto/tokenization_auto.py index 06bac0c2ff..470e8ce8e8 100644 --- a/src/transformers/models/auto/tokenization_auto.py +++ b/src/transformers/models/auto/tokenization_auto.py @@ -252,6 +252,7 @@ NO_CONFIG_TOKENIZER = [ HerbertTokenizerFast, PhobertTokenizer, BarthezTokenizer, + BarthezTokenizerFast, ] diff --git a/src/transformers/models/bart/tokenization_bart.py b/src/transformers/models/bart/tokenization_bart.py index 4a468d811e..eea85b00cd 100644 --- a/src/transformers/models/bart/tokenization_bart.py +++ b/src/transformers/models/bart/tokenization_bart.py @@ -38,11 +38,9 @@ class BartTokenizer(RobertaTokenizer): r""" Construct a BART tokenizer. - :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer` and adds a new - :meth:`~transformers.BartTokenizer.prepare_seq2seq_batch` - - Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the - initialization parameters and other methods. + :class:`~transformers.BartTokenizer` is identical to :class:`~transformers.RobertaTokenizer`. Refer to superclass + :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning the initialization + parameters and other methods. """ # merges and vocab same as Roberta max_model_input_sizes = {m: 1024 for m in _all_bart_models} diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index 87ae615821..83fca126fa 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ -37,6 +37,13 @@ _all_bart_models = [ class BartTokenizerFast(RobertaTokenizerFast): + r""" + Construct a "fast" BART tokenizer (backed by HuggingFace's `tokenizers` library). + + :class:`~transformers.BartTokenizerFast` is identical to :class:`~transformers.RobertaTokenizerFast`. Refer to + superclass :class:`~transformers.RobertaTokenizerFast` for usage examples and documentation concerning the + initialization parameters and other methods. + """ # merges and vocab same as Roberta max_model_input_sizes = {m: 1024 for m in _all_bart_models} pretrained_vocab_files_map = { diff --git a/src/transformers/models/barthez/tokenization_barthez.py b/src/transformers/models/barthez/tokenization_barthez.py index f9d13681da..d751de0e0c 100644 --- a/src/transformers/models/barthez/tokenization_barthez.py +++ b/src/transformers/models/barthez/tokenization_barthez.py @@ -90,8 +90,9 @@ class BarthezTokenizer(PreTrainedTokenizer): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. - Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every - conversion (string, tokens and IDs). + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/barthez/tokenization_barthez_fast.py b/src/transformers/models/barthez/tokenization_barthez_fast.py index 2d669ff8db..070d6e6c7e 100644 --- a/src/transformers/models/barthez/tokenization_barthez_fast.py +++ b/src/transformers/models/barthez/tokenization_barthez_fast.py @@ -98,9 +98,6 @@ class BarthezTokenizerFast(PreTrainedTokenizerFast): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. - - Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every - conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/blenderbot/tokenization_blenderbot.py b/src/transformers/models/blenderbot/tokenization_blenderbot.py index c502f73e8c..725f31605d 100644 --- a/src/transformers/models/blenderbot/tokenization_blenderbot.py +++ b/src/transformers/models/blenderbot/tokenization_blenderbot.py @@ -36,7 +36,7 @@ class BlenderbotTokenizer(RobertaTokenizer): Construct a Blenderbot tokenizer. :class:`~transformers.Blenderbot` is nearly identical to :class:`~transformers.RobertaTokenizer` and runs - end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesnt add BOS token + end-to-end tokenization: punctuation splitting and wordpiece. The only difference is that it doesn't add BOS token to the beginning of sequences. Refer to superclass :class:`~transformers.RobertaTokenizer` for usage examples and documentation concerning diff --git a/src/transformers/models/camembert/tokenization_camembert.py b/src/transformers/models/camembert/tokenization_camembert.py index b119292cef..0be12e76be 100644 --- a/src/transformers/models/camembert/tokenization_camembert.py +++ b/src/transformers/models/camembert/tokenization_camembert.py @@ -93,8 +93,9 @@ class CamembertTokenizer(PreTrainedTokenizer): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. - Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every - conversion (string, tokens and IDs). + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/camembert/tokenization_camembert_fast.py b/src/transformers/models/camembert/tokenization_camembert_fast.py index 813eedcdc3..437fa77173 100644 --- a/src/transformers/models/camembert/tokenization_camembert_fast.py +++ b/src/transformers/models/camembert/tokenization_camembert_fast.py @@ -101,10 +101,6 @@ class CamembertTokenizerFast(PreTrainedTokenizerFast): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. - - Attributes: - sp_model (:obj:`SentencePieceProcessor`): - The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/t5/tokenization_t5.py b/src/transformers/models/t5/tokenization_t5.py index 39d0783f4c..4dcd51d494 100644 --- a/src/transformers/models/t5/tokenization_t5.py +++ b/src/transformers/models/t5/tokenization_t5.py @@ -92,6 +92,10 @@ class T5Tokenizer(PreTrainedTokenizer): `__). additional_special_tokens (:obj:`List[str]`, `optional`): Additional special tokens used by the tokenizer. + + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py index b3ec309968..c621070522 100644 --- a/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py +++ b/src/transformers/models/xlm_prophetnet/tokenization_xlm_prophetnet.py @@ -97,8 +97,9 @@ class XLMProphetNetTokenizer(PreTrainedTokenizer): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. - Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every - conversion (string, tokens and IDs). + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py index b7a00494ea..2d85e30dd0 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta.py @@ -95,8 +95,9 @@ class XLMRobertaTokenizer(PreTrainedTokenizer): additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. - Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every - conversion (string, tokens and IDs). + Attributes: + sp_model (:obj:`SentencePieceProcessor`): + The `SentencePiece` processor that is used for every conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py index 902aa2d274..befd84be94 100644 --- a/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py +++ b/src/transformers/models/xlm_roberta/tokenization_xlm_roberta_fast.py @@ -106,9 +106,6 @@ class XLMRobertaTokenizerFast(PreTrainedTokenizerFast): modeling. This is the token which the model will try to predict. additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["NOTUSED", "NOTUSED"]`): Additional special tokens used by the tokenizer. - - Attributes: sp_model (:obj:`SentencePieceProcessor`): The `SentencePiece` processor that is used for every - conversion (string, tokens and IDs). """ vocab_files_names = VOCAB_FILES_NAMES diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 4c801f9144..425942668c 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1793,12 +1793,11 @@ class PreTrainedTokenizerBase(SpecialTokensMixin): def _from_pretrained( cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, *init_inputs, **kwargs ): - # We instantiate fast tokenizers based on a slow tokenizer for now - # In the future we can also use a direct way based on saving/instantiating - # tokenizer's Tokenizer directly from it's serialization JSON - if ( - "tokenizer_file" not in resolved_vocab_files or resolved_vocab_files["tokenizer_file"] is None - ) and cls.slow_tokenizer_class is not None: + # We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json + # file or if `from_slow` is set to True. + from_slow = kwargs.get("from_slow", False) + has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None + if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None: slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( copy.deepcopy(resolved_vocab_files), pretrained_model_name_or_path, diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index 92388507d2..4ee82a4552 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -80,8 +80,15 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): def __init__(self, *args, **kwargs): slow_tokenizer = kwargs.pop("__slow_tokenizer", None) fast_tokenizer_file = kwargs.pop("tokenizer_file", None) + from_slow = kwargs.pop("from_slow", False) - if fast_tokenizer_file is not None: + if from_slow and slow_tokenizer is None and self.slow_tokenizer_class is None: + raise ValueError( + "Cannot instantiate this tokenizer from a slow version. If it's based on sentencepiece, make sure you " + "have sentencepiece installed." + ) + + if fast_tokenizer_file is not None and not from_slow: # We have a serialization from tokenizers which let us directly build the backend fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file) elif slow_tokenizer is not None: