Fix properties of unset special tokens in non-verbose mode (#17797)

Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com>
This commit is contained in:
Guillaume Klein
2022-06-23 14:40:13 +02:00
committed by GitHub
parent b2fdbaccdd
commit 3eed5530ec
6 changed files with 56 additions and 24 deletions

View File

@@ -229,8 +229,9 @@ class BartTokenizerFast(PreTrainedTokenizerFast):
BART tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily BART tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*. comprise the space before the *<mask>*.
""" """
if self._mask_token is None and self.verbose: if self._mask_token is None:
logger.error("Using mask_token, but it is not set yet.") if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None return None
return str(self._mask_token) return str(self._mask_token)

View File

@@ -139,8 +139,9 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
comprise the space before the *[MASK]*. comprise the space before the *[MASK]*.
""" """
if self._mask_token is None and self.verbose: if self._mask_token is None:
logger.error("Using mask_token, but it is not set yet.") if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None return None
return str(self._mask_token) return str(self._mask_token)

View File

@@ -163,8 +163,9 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*. comprise the space before the *<mask>*.
""" """
if self._mask_token is None and self.verbose: if self._mask_token is None:
logger.error("Using mask_token, but it is not set yet.") if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None return None
return str(self._mask_token) return str(self._mask_token)

View File

@@ -235,8 +235,9 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
comprise the space before the *<mask>*. comprise the space before the *<mask>*.
""" """
if self._mask_token is None and self.verbose: if self._mask_token is None:
logger.error("Using mask_token, but it is not set yet.") if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None return None
return str(self._mask_token) return str(self._mask_token)

View File

@@ -968,8 +968,9 @@ class SpecialTokensMixin:
""" """
`str`: Beginning of sentence token. Log an error if used while not having been set. `str`: Beginning of sentence token. Log an error if used while not having been set.
""" """
if self._bos_token is None and self.verbose: if self._bos_token is None:
logger.error("Using bos_token, but it is not set yet.") if self.verbose:
logger.error("Using bos_token, but it is not set yet.")
return None return None
return str(self._bos_token) return str(self._bos_token)
@@ -978,8 +979,9 @@ class SpecialTokensMixin:
""" """
`str`: End of sentence token. Log an error if used while not having been set. `str`: End of sentence token. Log an error if used while not having been set.
""" """
if self._eos_token is None and self.verbose: if self._eos_token is None:
logger.error("Using eos_token, but it is not set yet.") if self.verbose:
logger.error("Using eos_token, but it is not set yet.")
return None return None
return str(self._eos_token) return str(self._eos_token)
@@ -988,8 +990,9 @@ class SpecialTokensMixin:
""" """
`str`: Unknown token. Log an error if used while not having been set. `str`: Unknown token. Log an error if used while not having been set.
""" """
if self._unk_token is None and self.verbose: if self._unk_token is None:
logger.error("Using unk_token, but it is not set yet.") if self.verbose:
logger.error("Using unk_token, but it is not set yet.")
return None return None
return str(self._unk_token) return str(self._unk_token)
@@ -999,8 +1002,9 @@ class SpecialTokensMixin:
`str`: Separation token, to separate context and query in an input sequence. Log an error if used while not `str`: Separation token, to separate context and query in an input sequence. Log an error if used while not
having been set. having been set.
""" """
if self._sep_token is None and self.verbose: if self._sep_token is None:
logger.error("Using sep_token, but it is not set yet.") if self.verbose:
logger.error("Using sep_token, but it is not set yet.")
return None return None
return str(self._sep_token) return str(self._sep_token)
@@ -1009,8 +1013,9 @@ class SpecialTokensMixin:
""" """
`str`: Padding token. Log an error if used while not having been set. `str`: Padding token. Log an error if used while not having been set.
""" """
if self._pad_token is None and self.verbose: if self._pad_token is None:
logger.error("Using pad_token, but it is not set yet.") if self.verbose:
logger.error("Using pad_token, but it is not set yet.")
return None return None
return str(self._pad_token) return str(self._pad_token)
@@ -1020,8 +1025,9 @@ class SpecialTokensMixin:
`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full `str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full
depth of the model. Log an error if used while not having been set. depth of the model. Log an error if used while not having been set.
""" """
if self._cls_token is None and self.verbose: if self._cls_token is None:
logger.error("Using cls_token, but it is not set yet.") if self.verbose:
logger.error("Using cls_token, but it is not set yet.")
return None return None
return str(self._cls_token) return str(self._cls_token)
@@ -1031,8 +1037,9 @@ class SpecialTokensMixin:
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not `str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
having been set. having been set.
""" """
if self._mask_token is None and self.verbose: if self._mask_token is None:
logger.error("Using mask_token, but it is not set yet.") if self.verbose:
logger.error("Using mask_token, but it is not set yet.")
return None return None
return str(self._mask_token) return str(self._mask_token)
@@ -1042,8 +1049,9 @@ class SpecialTokensMixin:
`List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been `List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been
set. set.
""" """
if self._additional_special_tokens is None and self.verbose: if self._additional_special_tokens is None:
logger.error("Using additional_special_tokens, but it is not set yet.") if self.verbose:
logger.error("Using additional_special_tokens, but it is not set yet.")
return None return None
return [str(tok) for tok in self._additional_special_tokens] return [str(tok) for tok in self._additional_special_tokens]

View File

@@ -31,6 +31,7 @@ from pathlib import Path
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
from huggingface_hub import HfFolder, Repository, delete_repo, set_access_token from huggingface_hub import HfFolder, Repository, delete_repo, set_access_token
from parameterized import parameterized
from requests.exceptions import HTTPError from requests.exceptions import HTTPError
from transformers import ( from transformers import (
AlbertTokenizer, AlbertTokenizer,
@@ -578,6 +579,25 @@ class TokenizerTesterMixin:
self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [token_to_test_setters]) self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [token_to_test_setters])
self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [token_id_to_test_setters]) self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [token_id_to_test_setters])
@parameterized.expand([(True,), (False,)])
def test_tokenizers_special_tokens_properties_unset(self, verbose):
    """Check that special-token properties read back as None once unset.

    Runs once with ``verbose=True`` and once with ``verbose=False``; the flag
    is forwarded to ``self.get_tokenizers``. For every tokenizer returned,
    each special-token attribute is assigned ``None`` and then read back
    through its property, which must return ``None``.
    """
    special_token_attrs = (
        "bos_token",
        "eos_token",
        "unk_token",
        "sep_token",
        "pad_token",
        "cls_token",
        "mask_token",
        "additional_special_tokens",
    )
    for tokenizer in self.get_tokenizers(verbose=verbose):
        # One sub-test per tokenizer class so a failure names the culprit.
        with self.subTest(f"{tokenizer.__class__.__name__}"):
            for attr in special_token_attrs:
                setattr(tokenizer, attr, None)
                self.assertIsNone(getattr(tokenizer, attr))
def test_save_and_load_tokenizer(self): def test_save_and_load_tokenizer(self):
# safety check on max_len default value so we are sure the test works # safety check on max_len default value so we are sure the test works
tokenizers = self.get_tokenizers() tokenizers = self.get_tokenizers()