From 3eed5530ec74bb60ad9f8f612717d0f6ccf820f2 Mon Sep 17 00:00:00 2001 From: Guillaume Klein Date: Thu, 23 Jun 2022 14:40:13 +0200 Subject: [PATCH] Fix properties of unset special tokens in non verbose mode (#17797) Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com> --- .../models/bart/tokenization_bart_fast.py | 5 ++- .../deberta/tokenization_deberta_fast.py | 5 ++- .../models/mpnet/tokenization_mpnet_fast.py | 5 ++- .../roberta/tokenization_roberta_fast.py | 5 ++- src/transformers/tokenization_utils_base.py | 40 +++++++++++-------- tests/test_tokenization_common.py | 20 ++++++++++ 6 files changed, 56 insertions(+), 24 deletions(-) diff --git a/src/transformers/models/bart/tokenization_bart_fast.py b/src/transformers/models/bart/tokenization_bart_fast.py index a7c86ea676..26c546be00 100644 --- a/src/transformers/models/bart/tokenization_bart_fast.py +++ b/src/transformers/models/bart/tokenization_bart_fast.py @@ -229,8 +229,9 @@ class BartTokenizerFast(PreTrainedTokenizerFast): BART tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily comprise the space before the *<mask>*. """ - if self._mask_token is None and self.verbose: - logger.error("Using mask_token, but it is not set yet.") + if self._mask_token is None: + if self.verbose: + logger.error("Using mask_token, but it is not set yet.") return None return str(self._mask_token) diff --git a/src/transformers/models/deberta/tokenization_deberta_fast.py b/src/transformers/models/deberta/tokenization_deberta_fast.py index 5b3852a6ed..5252c0c45d 100644 --- a/src/transformers/models/deberta/tokenization_deberta_fast.py +++ b/src/transformers/models/deberta/tokenization_deberta_fast.py @@ -139,8 +139,9 @@ class DebertaTokenizerFast(GPT2TokenizerFast): Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily comprise the space before the *[MASK]*. 
""" - if self._mask_token is None and self.verbose: - logger.error("Using mask_token, but it is not set yet.") + if self._mask_token is None: + if self.verbose: + logger.error("Using mask_token, but it is not set yet.") return None return str(self._mask_token) diff --git a/src/transformers/models/mpnet/tokenization_mpnet_fast.py b/src/transformers/models/mpnet/tokenization_mpnet_fast.py index c913f85682..f2fe4fe4fe 100644 --- a/src/transformers/models/mpnet/tokenization_mpnet_fast.py +++ b/src/transformers/models/mpnet/tokenization_mpnet_fast.py @@ -163,8 +163,9 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast): MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily comprise the space before the **. """ - if self._mask_token is None and self.verbose: - logger.error("Using mask_token, but it is not set yet.") + if self._mask_token is None: + if self.verbose: + logger.error("Using mask_token, but it is not set yet.") return None return str(self._mask_token) diff --git a/src/transformers/models/roberta/tokenization_roberta_fast.py b/src/transformers/models/roberta/tokenization_roberta_fast.py index cb055430b1..29381404c4 100644 --- a/src/transformers/models/roberta/tokenization_roberta_fast.py +++ b/src/transformers/models/roberta/tokenization_roberta_fast.py @@ -235,8 +235,9 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast): Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily comprise the space before the **. 
""" - if self._mask_token is None and self.verbose: - logger.error("Using mask_token, but it is not set yet.") + if self._mask_token is None: + if self.verbose: + logger.error("Using mask_token, but it is not set yet.") return None return str(self._mask_token) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 15500bee3e..96ee9c6eee 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -968,8 +968,9 @@ class SpecialTokensMixin: """ `str`: Beginning of sentence token. Log an error if used while not having been set. """ - if self._bos_token is None and self.verbose: - logger.error("Using bos_token, but it is not set yet.") + if self._bos_token is None: + if self.verbose: + logger.error("Using bos_token, but it is not set yet.") return None return str(self._bos_token) @@ -978,8 +979,9 @@ class SpecialTokensMixin: """ `str`: End of sentence token. Log an error if used while not having been set. """ - if self._eos_token is None and self.verbose: - logger.error("Using eos_token, but it is not set yet.") + if self._eos_token is None: + if self.verbose: + logger.error("Using eos_token, but it is not set yet.") return None return str(self._eos_token) @@ -988,8 +990,9 @@ class SpecialTokensMixin: """ `str`: Unknown token. Log an error if used while not having been set. """ - if self._unk_token is None and self.verbose: - logger.error("Using unk_token, but it is not set yet.") + if self._unk_token is None: + if self.verbose: + logger.error("Using unk_token, but it is not set yet.") return None return str(self._unk_token) @@ -999,8 +1002,9 @@ class SpecialTokensMixin: `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not having been set. 
""" - if self._sep_token is None and self.verbose: - logger.error("Using sep_token, but it is not set yet.") + if self._sep_token is None: + if self.verbose: + logger.error("Using sep_token, but it is not set yet.") return None return str(self._sep_token) @@ -1009,8 +1013,9 @@ class SpecialTokensMixin: """ `str`: Padding token. Log an error if used while not having been set. """ - if self._pad_token is None and self.verbose: - logger.error("Using pad_token, but it is not set yet.") + if self._pad_token is None: + if self.verbose: + logger.error("Using pad_token, but it is not set yet.") return None return str(self._pad_token) @@ -1020,8 +1025,9 @@ class SpecialTokensMixin: `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full depth of the model. Log an error if used while not having been set. """ - if self._cls_token is None and self.verbose: - logger.error("Using cls_token, but it is not set yet.") + if self._cls_token is None: + if self.verbose: + logger.error("Using cls_token, but it is not set yet.") return None return str(self._cls_token) @@ -1031,8 +1037,9 @@ class SpecialTokensMixin: `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not having been set. """ - if self._mask_token is None and self.verbose: - logger.error("Using mask_token, but it is not set yet.") + if self._mask_token is None: + if self.verbose: + logger.error("Using mask_token, but it is not set yet.") return None return str(self._mask_token) @@ -1042,8 +1049,9 @@ class SpecialTokensMixin: `List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been set. 
""" - if self._additional_special_tokens is None and self.verbose: - logger.error("Using additional_special_tokens, but it is not set yet.") + if self._additional_special_tokens is None: + if self.verbose: + logger.error("Using additional_special_tokens, but it is not set yet.") return None return [str(tok) for tok in self._additional_special_tokens] diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index 4b27c0edb7..2abff6bda9 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -31,6 +31,7 @@ from pathlib import Path from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union from huggingface_hub import HfFolder, Repository, delete_repo, set_access_token +from parameterized import parameterized from requests.exceptions import HTTPError from transformers import ( AlbertTokenizer, @@ -578,6 +579,25 @@ class TokenizerTesterMixin: self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [token_to_test_setters]) self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [token_id_to_test_setters]) + @parameterized.expand([(True,), (False,)]) + def test_tokenizers_special_tokens_properties_unset(self, verbose): + tokenizers = self.get_tokenizers(verbose=verbose) + for tokenizer in tokenizers: + with self.subTest(f"{tokenizer.__class__.__name__}"): + attributes_list = [ + "bos_token", + "eos_token", + "unk_token", + "sep_token", + "pad_token", + "cls_token", + "mask_token", + "additional_special_tokens", + ] + for attr in attributes_list: + setattr(tokenizer, attr, None) + self.assertIsNone(getattr(tokenizer, attr)) + def test_save_and_load_tokenizer(self): # safety check on max_len default value so we are sure the test works tokenizers = self.get_tokenizers()