Fix properties of unset special tokens in non-verbose mode (#17797)
Co-authored-by: SaulLu <55560583+SaulLu@users.noreply.github.com>
This commit is contained in:
@@ -229,7 +229,8 @@ class BartTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
BART tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
|
BART tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
|
||||||
comprise the space before the *<mask>*.
|
comprise the space before the *<mask>*.
|
||||||
"""
|
"""
|
||||||
if self._mask_token is None and self.verbose:
|
if self._mask_token is None:
|
||||||
|
if self.verbose:
|
||||||
logger.error("Using mask_token, but it is not set yet.")
|
logger.error("Using mask_token, but it is not set yet.")
|
||||||
return None
|
return None
|
||||||
return str(self._mask_token)
|
return str(self._mask_token)
|
||||||
|
|||||||
@@ -139,7 +139,8 @@ class DebertaTokenizerFast(GPT2TokenizerFast):
|
|||||||
Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
|
Deberta tokenizer has a special mask token to be used in the fill-mask pipeline. The mask token will greedily
|
||||||
comprise the space before the *[MASK]*.
|
comprise the space before the *[MASK]*.
|
||||||
"""
|
"""
|
||||||
if self._mask_token is None and self.verbose:
|
if self._mask_token is None:
|
||||||
|
if self.verbose:
|
||||||
logger.error("Using mask_token, but it is not set yet.")
|
logger.error("Using mask_token, but it is not set yet.")
|
||||||
return None
|
return None
|
||||||
return str(self._mask_token)
|
return str(self._mask_token)
|
||||||
|
|||||||
@@ -163,7 +163,8 @@ class MPNetTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
|
MPNet tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
|
||||||
comprise the space before the *<mask>*.
|
comprise the space before the *<mask>*.
|
||||||
"""
|
"""
|
||||||
if self._mask_token is None and self.verbose:
|
if self._mask_token is None:
|
||||||
|
if self.verbose:
|
||||||
logger.error("Using mask_token, but it is not set yet.")
|
logger.error("Using mask_token, but it is not set yet.")
|
||||||
return None
|
return None
|
||||||
return str(self._mask_token)
|
return str(self._mask_token)
|
||||||
|
|||||||
@@ -235,7 +235,8 @@ class RobertaTokenizerFast(PreTrainedTokenizerFast):
|
|||||||
Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
|
Roberta tokenizer has a special mask token to be usable in the fill-mask pipeline. The mask token will greedily
|
||||||
comprise the space before the *<mask>*.
|
comprise the space before the *<mask>*.
|
||||||
"""
|
"""
|
||||||
if self._mask_token is None and self.verbose:
|
if self._mask_token is None:
|
||||||
|
if self.verbose:
|
||||||
logger.error("Using mask_token, but it is not set yet.")
|
logger.error("Using mask_token, but it is not set yet.")
|
||||||
return None
|
return None
|
||||||
return str(self._mask_token)
|
return str(self._mask_token)
|
||||||
|
|||||||
@@ -968,7 +968,8 @@ class SpecialTokensMixin:
|
|||||||
"""
|
"""
|
||||||
`str`: Beginning of sentence token. Log an error if used while not having been set.
|
`str`: Beginning of sentence token. Log an error if used while not having been set.
|
||||||
"""
|
"""
|
||||||
if self._bos_token is None and self.verbose:
|
if self._bos_token is None:
|
||||||
|
if self.verbose:
|
||||||
logger.error("Using bos_token, but it is not set yet.")
|
logger.error("Using bos_token, but it is not set yet.")
|
||||||
return None
|
return None
|
||||||
return str(self._bos_token)
|
return str(self._bos_token)
|
||||||
@@ -978,7 +979,8 @@ class SpecialTokensMixin:
|
|||||||
"""
|
"""
|
||||||
`str`: End of sentence token. Log an error if used while not having been set.
|
`str`: End of sentence token. Log an error if used while not having been set.
|
||||||
"""
|
"""
|
||||||
if self._eos_token is None and self.verbose:
|
if self._eos_token is None:
|
||||||
|
if self.verbose:
|
||||||
logger.error("Using eos_token, but it is not set yet.")
|
logger.error("Using eos_token, but it is not set yet.")
|
||||||
return None
|
return None
|
||||||
return str(self._eos_token)
|
return str(self._eos_token)
|
||||||
@@ -988,7 +990,8 @@ class SpecialTokensMixin:
|
|||||||
"""
|
"""
|
||||||
`str`: Unknown token. Log an error if used while not having been set.
|
`str`: Unknown token. Log an error if used while not having been set.
|
||||||
"""
|
"""
|
||||||
if self._unk_token is None and self.verbose:
|
if self._unk_token is None:
|
||||||
|
if self.verbose:
|
||||||
logger.error("Using unk_token, but it is not set yet.")
|
logger.error("Using unk_token, but it is not set yet.")
|
||||||
return None
|
return None
|
||||||
return str(self._unk_token)
|
return str(self._unk_token)
|
||||||
@@ -999,7 +1002,8 @@ class SpecialTokensMixin:
|
|||||||
`str`: Separation token, to separate context and query in an input sequence. Log an error if used while not
|
`str`: Separation token, to separate context and query in an input sequence. Log an error if used while not
|
||||||
having been set.
|
having been set.
|
||||||
"""
|
"""
|
||||||
if self._sep_token is None and self.verbose:
|
if self._sep_token is None:
|
||||||
|
if self.verbose:
|
||||||
logger.error("Using sep_token, but it is not set yet.")
|
logger.error("Using sep_token, but it is not set yet.")
|
||||||
return None
|
return None
|
||||||
return str(self._sep_token)
|
return str(self._sep_token)
|
||||||
@@ -1009,7 +1013,8 @@ class SpecialTokensMixin:
|
|||||||
"""
|
"""
|
||||||
`str`: Padding token. Log an error if used while not having been set.
|
`str`: Padding token. Log an error if used while not having been set.
|
||||||
"""
|
"""
|
||||||
if self._pad_token is None and self.verbose:
|
if self._pad_token is None:
|
||||||
|
if self.verbose:
|
||||||
logger.error("Using pad_token, but it is not set yet.")
|
logger.error("Using pad_token, but it is not set yet.")
|
||||||
return None
|
return None
|
||||||
return str(self._pad_token)
|
return str(self._pad_token)
|
||||||
@@ -1020,7 +1025,8 @@ class SpecialTokensMixin:
|
|||||||
`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full
|
`str`: Classification token, to extract a summary of an input sequence leveraging self-attention along the full
|
||||||
depth of the model. Log an error if used while not having been set.
|
depth of the model. Log an error if used while not having been set.
|
||||||
"""
|
"""
|
||||||
if self._cls_token is None and self.verbose:
|
if self._cls_token is None:
|
||||||
|
if self.verbose:
|
||||||
logger.error("Using cls_token, but it is not set yet.")
|
logger.error("Using cls_token, but it is not set yet.")
|
||||||
return None
|
return None
|
||||||
return str(self._cls_token)
|
return str(self._cls_token)
|
||||||
@@ -1031,7 +1037,8 @@ class SpecialTokensMixin:
|
|||||||
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
|
`str`: Mask token, to use when training a model with masked-language modeling. Log an error if used while not
|
||||||
having been set.
|
having been set.
|
||||||
"""
|
"""
|
||||||
if self._mask_token is None and self.verbose:
|
if self._mask_token is None:
|
||||||
|
if self.verbose:
|
||||||
logger.error("Using mask_token, but it is not set yet.")
|
logger.error("Using mask_token, but it is not set yet.")
|
||||||
return None
|
return None
|
||||||
return str(self._mask_token)
|
return str(self._mask_token)
|
||||||
@@ -1042,7 +1049,8 @@ class SpecialTokensMixin:
|
|||||||
`List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been
|
`List[str]`: All the additional special tokens you may want to use. Log an error if used while not having been
|
||||||
set.
|
set.
|
||||||
"""
|
"""
|
||||||
if self._additional_special_tokens is None and self.verbose:
|
if self._additional_special_tokens is None:
|
||||||
|
if self.verbose:
|
||||||
logger.error("Using additional_special_tokens, but it is not set yet.")
|
logger.error("Using additional_special_tokens, but it is not set yet.")
|
||||||
return None
|
return None
|
||||||
return [str(tok) for tok in self._additional_special_tokens]
|
return [str(tok) for tok in self._additional_special_tokens]
|
||||||
|
|||||||
@@ -31,6 +31,7 @@ from pathlib import Path
|
|||||||
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
|
from typing import TYPE_CHECKING, Any, Dict, List, Tuple, Union
|
||||||
|
|
||||||
from huggingface_hub import HfFolder, Repository, delete_repo, set_access_token
|
from huggingface_hub import HfFolder, Repository, delete_repo, set_access_token
|
||||||
|
from parameterized import parameterized
|
||||||
from requests.exceptions import HTTPError
|
from requests.exceptions import HTTPError
|
||||||
from transformers import (
|
from transformers import (
|
||||||
AlbertTokenizer,
|
AlbertTokenizer,
|
||||||
@@ -578,6 +579,25 @@ class TokenizerTesterMixin:
|
|||||||
self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [token_to_test_setters])
|
self.assertListEqual(getattr(tokenizer, "additional_special_tokens"), [token_to_test_setters])
|
||||||
self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [token_id_to_test_setters])
|
self.assertListEqual(getattr(tokenizer, "additional_special_tokens_ids"), [token_id_to_test_setters])
|
||||||
|
|
||||||
|
@parameterized.expand([(True,), (False,)])
|
||||||
|
def test_tokenizers_special_tokens_properties_unset(self, verbose):
|
||||||
|
tokenizers = self.get_tokenizers(verbose=verbose)
|
||||||
|
for tokenizer in tokenizers:
|
||||||
|
with self.subTest(f"{tokenizer.__class__.__name__}"):
|
||||||
|
attributes_list = [
|
||||||
|
"bos_token",
|
||||||
|
"eos_token",
|
||||||
|
"unk_token",
|
||||||
|
"sep_token",
|
||||||
|
"pad_token",
|
||||||
|
"cls_token",
|
||||||
|
"mask_token",
|
||||||
|
"additional_special_tokens",
|
||||||
|
]
|
||||||
|
for attr in attributes_list:
|
||||||
|
setattr(tokenizer, attr, None)
|
||||||
|
self.assertIsNone(getattr(tokenizer, attr))
|
||||||
|
|
||||||
def test_save_and_load_tokenizer(self):
|
def test_save_and_load_tokenizer(self):
|
||||||
# safety check on max_len default value so we are sure the test works
|
# safety check on max_len default value so we are sure the test works
|
||||||
tokenizers = self.get_tokenizers()
|
tokenizers = self.get_tokenizers()
|
||||||
|
|||||||
Reference in New Issue
Block a user