From 2229ebe7220fb54bc5f91f575c2d7a988e7122cb Mon Sep 17 00:00:00 2001 From: Ita Zaporozhets <31893021+itazap@users.noreply.github.com> Date: Thu, 1 Aug 2024 13:57:41 +0200 Subject: [PATCH] update clean_up_tokenization_spaces warning (#32371) --- src/transformers/tokenization_utils_base.py | 8 ++++ tests/test_tokenization_common.py | 46 --------------------- 2 files changed, 8 insertions(+), 46 deletions(-) diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 6142dad13c..60c52633a7 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1593,6 +1593,14 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) + if "clean_up_tokenization_spaces" not in kwargs: + warnings.warn( + "`clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This " + "behavior will be deprecated in transformers v4.45, and will be then set to `False` by default. " + "For more details check this issue: https://github.com/huggingface/transformers/issues/31884", + FutureWarning, + ) + # By default, cleaning tokenization spaces for both fast and slow tokenizers self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True) diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index a1fb5124a4..021557c1a5 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -4247,52 +4247,6 @@ class TokenizerTesterMixin: # Should not raise an error self.rust_tokenizer_class.from_pretrained(tmp_dir_2) - # TODO This is ran for all models but only tests bert... - def test_clean_up_tokenization_spaces(self): - tokenizer = BertTokenizer.from_pretrained("google-bert/bert-base-uncased") - assert tokenizer.clean_up_tokenization_spaces is True - - tokens = tokenizer.encode("This shouldn't be! 
He'll go.") - decoded = tokenizer.decode(tokens) - assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]" - - tokenizer.clean_up_tokenization_spaces = False - decoded = tokenizer.decode(tokens) - assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]" - assert decoded == tokenizer.decode(tokens, clean_up_tokenization_spaces=False) - - # Fast from slow - with tempfile.TemporaryDirectory() as tmp_dir_2: - tokenizer.save_pretrained(tmp_dir_2) - tokenizer_fast = BertTokenizerFast.from_pretrained(tmp_dir_2) - del tokenizer - - assert tokenizer_fast.clean_up_tokenization_spaces is False - decoded = tokenizer_fast.decode(tokens) - # fast and slow don't have the same output when we don't cleanup - # tokenization space. Here `be!` vs `be !` and `go.` vs `go .` - assert decoded == "[CLS] this shouldn ' t be! he ' ll go. [SEP]" - - tokenizer_fast.clean_up_tokenization_spaces = True - assert tokenizer_fast.clean_up_tokenization_spaces is True - - decoded = tokenizer_fast.decode(tokens) - assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]" - - # Slow from fast - with tempfile.TemporaryDirectory() as tmp_dir_2: - tokenizer_fast.clean_up_tokenization_spaces = False - tokenizer_fast.save_pretrained(tmp_dir_2) - tokenizer = BertTokenizer.from_pretrained(tmp_dir_2) - - assert tokenizer.clean_up_tokenization_spaces is False - decoded = tokenizer.decode(tokens) - assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]" - - tokenizer.clean_up_tokenization_spaces = True - decoded = tokenizer.decode(tokens) - assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]" - def test_split_special_tokens(self): if not self.test_slow_tokenizer: self.skipTest(reason="test_slow_tokenizer is set to False")