clean_up_tokenization_spaces=False if unset (#31938)

* clean_up_tokenization_spaces=False if unset

* deprecate warning

* updating param for old models

* update models

* make fix-copies

* fix-copies and update bert models

* warning msg

* update prophet and clvp

* updating test since space before is arbitrarily removed

* remove warning for 4.45
This commit is contained in:
Ita Zaporozhets
2024-09-26 13:38:20 -04:00
committed by GitHub
parent 3557f9a14a
commit 6730485b02
16 changed files with 67 additions and 7 deletions

View File

@@ -88,6 +88,9 @@ class BertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up spaces after decoding. Cleanup consists of removing potential artifacts like
extra spaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -105,6 +108,7 @@ class BertTokenizer(PreTrainedTokenizer):
mask_token="[MASK]", mask_token="[MASK]",
tokenize_chinese_chars=True, tokenize_chinese_chars=True,
strip_accents=None, strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs, **kwargs,
): ):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
@@ -136,6 +140,7 @@ class BertTokenizer(PreTrainedTokenizer):
mask_token=mask_token, mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars, tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents, strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )

View File

@@ -91,6 +91,9 @@ class ConvBertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original ConvBERT). value for `lowercase` (as in the original ConvBERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up spaces after decoding. Cleanup consists of removing potential artifacts like
extra spaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -108,6 +111,7 @@ class ConvBertTokenizer(PreTrainedTokenizer):
mask_token="[MASK]", mask_token="[MASK]",
tokenize_chinese_chars=True, tokenize_chinese_chars=True,
strip_accents=None, strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs, **kwargs,
): ):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
@@ -139,6 +143,7 @@ class ConvBertTokenizer(PreTrainedTokenizer):
mask_token=mask_token, mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars, tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents, strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )

View File

@@ -90,6 +90,9 @@ class DistilBertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up spaces after decoding. Cleanup consists of removing potential artifacts like
extra spaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -108,6 +111,7 @@ class DistilBertTokenizer(PreTrainedTokenizer):
mask_token="[MASK]", mask_token="[MASK]",
tokenize_chinese_chars=True, tokenize_chinese_chars=True,
strip_accents=None, strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs, **kwargs,
): ):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
@@ -138,6 +142,7 @@ class DistilBertTokenizer(PreTrainedTokenizer):
mask_token=mask_token, mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars, tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents, strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )

View File

@@ -90,6 +90,9 @@ class ElectraTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original Electra). value for `lowercase` (as in the original Electra).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up spaces after decoding. Cleanup consists of removing potential artifacts like
extra spaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -107,6 +110,7 @@ class ElectraTokenizer(PreTrainedTokenizer):
mask_token="[MASK]", mask_token="[MASK]",
tokenize_chinese_chars=True, tokenize_chinese_chars=True,
strip_accents=None, strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs, **kwargs,
): ):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
@@ -138,6 +142,7 @@ class ElectraTokenizer(PreTrainedTokenizer):
mask_token=mask_token, mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars, tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents, strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )

View File

@@ -107,6 +107,9 @@ class FunnelTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up spaces after decoding. Cleanup consists of removing potential artifacts like
extra spaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -127,6 +130,7 @@ class FunnelTokenizer(PreTrainedTokenizer):
eos_token="</s>", eos_token="</s>",
tokenize_chinese_chars=True, tokenize_chinese_chars=True,
strip_accents=None, strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs, **kwargs,
): ):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
@@ -159,6 +163,7 @@ class FunnelTokenizer(PreTrainedTokenizer):
eos_token=eos_token, eos_token=eos_token,
tokenize_chinese_chars=tokenize_chinese_chars, tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents, strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )

View File

@@ -91,6 +91,9 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original LayoutLM). value for `lowercase` (as in the original LayoutLM).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up spaces after decoding. Cleanup consists of removing potential artifacts like
extra spaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -108,6 +111,7 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
mask_token="[MASK]", mask_token="[MASK]",
tokenize_chinese_chars=True, tokenize_chinese_chars=True,
strip_accents=None, strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs, **kwargs,
): ):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
@@ -139,6 +143,7 @@ class LayoutLMTokenizer(PreTrainedTokenizer):
mask_token=mask_token, mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars, tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents, strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )

View File

@@ -90,6 +90,9 @@ class LxmertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original Lxmert). value for `lowercase` (as in the original Lxmert).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up spaces after decoding. Cleanup consists of removing potential artifacts like
extra spaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -107,6 +110,7 @@ class LxmertTokenizer(PreTrainedTokenizer):
mask_token="[MASK]", mask_token="[MASK]",
tokenize_chinese_chars=True, tokenize_chinese_chars=True,
strip_accents=None, strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs, **kwargs,
): ):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
@@ -138,6 +142,7 @@ class LxmertTokenizer(PreTrainedTokenizer):
mask_token=mask_token, mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars, tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents, strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )

View File

@@ -92,6 +92,9 @@ class MobileBertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original MobileBERT). value for `lowercase` (as in the original MobileBERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up spaces after decoding. Cleanup consists of removing potential artifacts like
extra spaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -109,6 +112,7 @@ class MobileBertTokenizer(PreTrainedTokenizer):
mask_token="[MASK]", mask_token="[MASK]",
tokenize_chinese_chars=True, tokenize_chinese_chars=True,
strip_accents=None, strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs, **kwargs,
): ):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
@@ -140,6 +144,7 @@ class MobileBertTokenizer(PreTrainedTokenizer):
mask_token=mask_token, mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars, tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents, strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )

View File

@@ -108,6 +108,9 @@ class MPNetTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up spaces after decoding. Cleanup consists of removing potential artifacts like
extra spaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -128,6 +131,7 @@ class MPNetTokenizer(PreTrainedTokenizer):
mask_token="<mask>", mask_token="<mask>",
tokenize_chinese_chars=True, tokenize_chinese_chars=True,
strip_accents=None, strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs, **kwargs,
): ):
bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
@@ -170,6 +174,7 @@ class MPNetTokenizer(PreTrainedTokenizer):
mask_token=mask_token, mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars, tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents, strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )

View File

@@ -308,6 +308,9 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original BERT). value for `lowercase` (as in the original BERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up spaces after decoding. Cleanup consists of removing potential artifacts like
extra spaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -330,6 +333,7 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
mask_token: Optional[str] = "[MASK]", mask_token: Optional[str] = "[MASK]",
tokenize_chinese_chars: Optional[bool] = True, tokenize_chinese_chars: Optional[bool] = True,
strip_accents: Optional[bool] = None, strip_accents: Optional[bool] = None,
clean_up_tokenization_spaces: bool = True,
**kwargs, **kwargs,
): ):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
@@ -360,6 +364,7 @@ class ProphetNetTokenizer(PreTrainedTokenizer):
mask_token=mask_token, mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars, tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents, strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )

View File

@@ -91,6 +91,9 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
strip_accents (`bool`, *optional*): strip_accents (`bool`, *optional*):
Whether or not to strip all accents. If this option is not specified, then it will be determined by the Whether or not to strip all accents. If this option is not specified, then it will be determined by the
value for `lowercase` (as in the original SqueezeBERT). value for `lowercase` (as in the original SqueezeBERT).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up spaces after decoding. Cleanup consists of removing potential artifacts like
extra spaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -108,6 +111,7 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
mask_token="[MASK]", mask_token="[MASK]",
tokenize_chinese_chars=True, tokenize_chinese_chars=True,
strip_accents=None, strip_accents=None,
clean_up_tokenization_spaces=True,
**kwargs, **kwargs,
): ):
if not os.path.isfile(vocab_file): if not os.path.isfile(vocab_file):
@@ -139,6 +143,7 @@ class SqueezeBertTokenizer(PreTrainedTokenizer):
mask_token=mask_token, mask_token=mask_token,
tokenize_chinese_chars=tokenize_chinese_chars, tokenize_chinese_chars=tokenize_chinese_chars,
strip_accents=strip_accents, strip_accents=strip_accents,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )

View File

@@ -225,6 +225,9 @@ class TapasTokenizer(PreTrainedTokenizer):
Minimum length of each question in terms of tokens (will be skipped otherwise). Minimum length of each question in terms of tokens (will be skipped otherwise).
max_question_length (`int`, *optional*): max_question_length (`int`, *optional*):
Maximum length of each question in terms of tokens (will be skipped otherwise). Maximum length of each question in terms of tokens (will be skipped otherwise).
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up spaces after decoding. Cleanup consists of removing potential artifacts like
extra spaces.
""" """
vocab_files_names = VOCAB_FILES_NAMES vocab_files_names = VOCAB_FILES_NAMES
@@ -252,6 +255,7 @@ class TapasTokenizer(PreTrainedTokenizer):
max_question_length=None, max_question_length=None,
model_max_length: int = 512, model_max_length: int = 512,
additional_special_tokens: Optional[List[str]] = None, additional_special_tokens: Optional[List[str]] = None,
clean_up_tokenization_spaces=True,
**kwargs, **kwargs,
): ):
if not is_pandas_available(): if not is_pandas_available():
@@ -322,6 +326,7 @@ class TapasTokenizer(PreTrainedTokenizer):
max_question_length=max_question_length, max_question_length=max_question_length,
model_max_length=model_max_length, model_max_length=model_max_length,
additional_special_tokens=additional_special_tokens, additional_special_tokens=additional_special_tokens,
clean_up_tokenization_spaces=clean_up_tokenization_spaces,
**kwargs, **kwargs,
) )

View File

@@ -1622,7 +1622,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
) )
# By default, cleaning tokenization spaces for both fast and slow tokenizers # By default, do not clean up tokenization spaces (for both fast and slow tokenizers) unless explicitly requested
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True) self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False)
# By default, do not split special tokens for both fast and slow tokenizers # By default, do not split special tokens for both fast and slow tokenizers
self.split_special_tokens = kwargs.pop("split_special_tokens", False) self.split_special_tokens = kwargs.pop("split_special_tokens", False)

View File

@@ -79,7 +79,7 @@ class ClvpTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
# Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts # Copied from transformers.tests.models.gpt2.test_tokenization_gpt2.GPT2TokenizationTest.get_input_output_texts
def get_input_output_texts(self, tokenizer): def get_input_output_texts(self, tokenizer):
input_text = "lower newer" input_text = "lower newer"
output_text = "lower newer" output_text = "lower[SPACE]newer"
return input_text, output_text return input_text, output_text
# Copied from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens # Copied from transformers.tests.models.layoutxlm.test_tokenization_layoutxlm.LayoutXLMTokenizationTest.test_add_special_tokens

View File

@@ -147,8 +147,8 @@ class Wav2Vec2TokenizerTest(unittest.TestCase):
batch_tokens = tokenizer.batch_decode(sample_ids) batch_tokens = tokenizer.batch_decode(sample_ids)
batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True) batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True)
self.assertEqual(batch_tokens, ["HELLO<unk>!?!?$$$", "BYE BYE<unk>$$$"]) self.assertEqual(batch_tokens, ["HELLO<unk>!? !?$$$", "BYE BYE<unk>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!?!?", "BYE BYE"]) self.assertEqual(batch_tokens_2, ["HELO!? !?", "BYE BYE"])
def test_call(self): def test_call(self):
# Tests that all call wrap to encode_plus and batch_encode_plus # Tests that all call wrap to encode_plus and batch_encode_plus
@@ -467,8 +467,8 @@ class Wav2Vec2CTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
batch_tokens = tokenizer.batch_decode(sample_ids) batch_tokens = tokenizer.batch_decode(sample_ids)
batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True) batch_tokens_2 = tokenizer.batch_decode(sample_ids, skip_special_tokens=True)
self.assertEqual(batch_tokens, ["HELLO<unk>!?!?<new_tokens>$$$", "BYE BYE<unk><new_tokens>$$$"]) self.assertEqual(batch_tokens, ["HELLO<unk>!? !?<new_tokens>$$$", "BYE BYE<unk><new_tokens>$$$"])
self.assertEqual(batch_tokens_2, ["HELO!?!?<new_tokens>", "BYE BYE<new_tokens>"]) self.assertEqual(batch_tokens_2, ["HELO!? !?<new_tokens>", "BYE BYE<new_tokens>"])
def test_special_characters_in_vocab(self): def test_special_characters_in_vocab(self):
sent = "ʈʰ æ æ̃ ˧ kʰ" sent = "ʈʰ æ æ̃ ˧ kʰ"

View File

@@ -249,7 +249,7 @@ class Wav2Vec2PhonemeCTCTokenizerTest(TokenizerTesterMixin, unittest.TestCase):
# fmt: on # fmt: on
batch_tokens = tokenizer.batch_decode(sample_ids) batch_tokens = tokenizer.batch_decode(sample_ids)
self.assertEqual(batch_tokens, ["k s ɾ ɾ l ɭʲ!?!? $$$", "j ð s j ð s oːɹ $$$"]) self.assertEqual(batch_tokens, ["k s ɾ ɾ l ɭʲ ! ? ! ? $$$", "j ð s j ð s oːɹ $$$"])
@staticmethod @staticmethod
def get_from_offsets(offsets, key): def get_from_offsets(offsets, key):