From 8d9c3836bea773a243a691bfd80f02168f1db3b7 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Wed, 29 Mar 2023 13:21:07 +0200 Subject: [PATCH] Add clean_up_tokenization_spaces to config (#22341) * add draft changes * fix failing wav2vec * style * make sure that the argument is saved + add tests * style * fixup * update test * default clean_up_tokenization_spaces to False for Bloom and Llama * Update code based on review Co-authored-by: Nicolas Patry * style * quality --------- Co-authored-by: Nicolas Patry --- .../models/big_bird/tokenization_big_bird.py | 7 ++- .../models/bloom/tokenization_bloom_fast.py | 2 + .../models/codegen/tokenization_codegen.py | 7 +-- .../codegen/tokenization_codegen_fast.py | 7 +-- .../models/fnet/tokenization_fnet.py | 7 ++- .../models/llama/tokenization_llama.py | 9 +++- .../models/marian/tokenization_marian.py | 10 +++-- .../models/wav2vec2/tokenization_wav2vec2.py | 22 ++++++--- .../tokenization_wav2vec2_phoneme.py | 15 ++++--- .../models/whisper/tokenization_whisper.py | 7 +-- .../whisper/tokenization_whisper_fast.py | 7 +-- .../models/xlnet/tokenization_xlnet.py | 7 ++- src/transformers/tokenization_utils.py | 7 ++- src/transformers/tokenization_utils_base.py | 26 +++++++---- src/transformers/tokenization_utils_fast.py | 7 ++- tests/test_tokenization_common.py | 45 +++++++++++++++++++ 16 files changed, 150 insertions(+), 42 deletions(-) diff --git a/src/transformers/models/big_bird/tokenization_big_bird.py b/src/transformers/models/big_bird/tokenization_big_bird.py index bd6f90ef02..5220366df4 100644 --- a/src/transformers/models/big_bird/tokenization_big_bird.py +++ b/src/transformers/models/big_bird/tokenization_big_bird.py @@ -204,7 +204,7 @@ class BigBirdTokenizer(PreTrainedTokenizer): self, token_ids: List[int], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, spaces_between_special_tokens: bool = True, 
**kwargs, ) -> str: @@ -237,6 +237,11 @@ class BigBirdTokenizer(PreTrainedTokenizer): else: text = "".join(sub_texts) + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) return clean_text diff --git a/src/transformers/models/bloom/tokenization_bloom_fast.py b/src/transformers/models/bloom/tokenization_bloom_fast.py index 800c73f025..d20065c903 100644 --- a/src/transformers/models/bloom/tokenization_bloom_fast.py +++ b/src/transformers/models/bloom/tokenization_bloom_fast.py @@ -115,6 +115,7 @@ class BloomTokenizerFast(PreTrainedTokenizerFast): eos_token="", pad_token="", add_prefix_space=False, + clean_up_tokenization_spaces=False, **kwargs, ): super().__init__( @@ -126,6 +127,7 @@ class BloomTokenizerFast(PreTrainedTokenizerFast): eos_token=eos_token, pad_token=pad_token, add_prefix_space=add_prefix_space, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, **kwargs, ) pre_tok_state = json.loads(self.backend_tokenizer.pre_tokenizer.__getstate__()) diff --git a/src/transformers/models/codegen/tokenization_codegen.py b/src/transformers/models/codegen/tokenization_codegen.py index 0c01e63471..14d79bb1ce 100644 --- a/src/transformers/models/codegen/tokenization_codegen.py +++ b/src/transformers/models/codegen/tokenization_codegen.py @@ -320,7 +320,7 @@ class CodeGenTokenizer(PreTrainedTokenizer): self, token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, truncate_before_pattern: Optional[List[str]] = None, **kwargs, ) -> str: @@ -335,8 +335,9 @@ class CodeGenTokenizer(PreTrainedTokenizer): List of tokenized input ids. Can be obtained using the `__call__` method. 
skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): - Whether or not to clean up the tokenization spaces. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. If `None`, will default to + `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`). truncate_before_pattern (`List[str]`, *optional*, defaults to `None`): A list of regular expression strings that will be used to truncate the returned string. This can be used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning diff --git a/src/transformers/models/codegen/tokenization_codegen_fast.py b/src/transformers/models/codegen/tokenization_codegen_fast.py index 83e5e30d0b..fad5e24dbc 100644 --- a/src/transformers/models/codegen/tokenization_codegen_fast.py +++ b/src/transformers/models/codegen/tokenization_codegen_fast.py @@ -187,7 +187,7 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast): self, token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, truncate_before_pattern: Optional[List[str]] = None, **kwargs, ) -> str: @@ -202,8 +202,9 @@ class CodeGenTokenizerFast(PreTrainedTokenizerFast): List of tokenized input ids. Can be obtained using the `__call__` method. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): - Whether or not to clean up the tokenization spaces. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. If `None`, will default to + `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`). 
truncate_before_pattern (`List[str]`, *optional*, defaults to `None`): A list of regular expression strings that will be used to truncate the returned string. This can be used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning diff --git a/src/transformers/models/fnet/tokenization_fnet.py b/src/transformers/models/fnet/tokenization_fnet.py index 6edcec45b5..7324f509a8 100644 --- a/src/transformers/models/fnet/tokenization_fnet.py +++ b/src/transformers/models/fnet/tokenization_fnet.py @@ -236,7 +236,7 @@ class FNetTokenizer(PreTrainedTokenizer): self, token_ids: List[int], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, spaces_between_special_tokens: bool = True, **kwargs, ) -> str: @@ -269,6 +269,11 @@ class FNetTokenizer(PreTrainedTokenizer): else: text = "".join(sub_texts) + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) return clean_text diff --git a/src/transformers/models/llama/tokenization_llama.py b/src/transformers/models/llama/tokenization_llama.py index 521bb854f7..618af846ce 100644 --- a/src/transformers/models/llama/tokenization_llama.py +++ b/src/transformers/models/llama/tokenization_llama.py @@ -59,10 +59,17 @@ class LlamaTokenizer(PreTrainedTokenizer): add_bos_token=True, add_eos_token=False, decode_with_prefix_space=False, + clean_up_tokenization_spaces=False, **kwargs, ): self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - super().__init__(bos_token=bos_token, eos_token=eos_token, unk_token=unk_token, **kwargs) + super().__init__( + bos_token=bos_token, + eos_token=eos_token, + unk_token=unk_token, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs, + ) self.vocab_file = vocab_file 
self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token diff --git a/src/transformers/models/marian/tokenization_marian.py b/src/transformers/models/marian/tokenization_marian.py index aa63b1ff88..96a1f47bf7 100644 --- a/src/transformers/models/marian/tokenization_marian.py +++ b/src/transformers/models/marian/tokenization_marian.py @@ -225,8 +225,9 @@ class MarianTokenizer(PreTrainedTokenizer): List of tokenized input ids. Can be obtained using the `__call__` method. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): - Whether or not to clean up the tokenization spaces. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. If `None`, will default to + `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`). use_source_tokenizer (`bool`, *optional*, defaults to `False`): Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence problems). @@ -250,8 +251,9 @@ class MarianTokenizer(PreTrainedTokenizer): List of tokenized input ids. Can be obtained using the `__call__` method. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): - Whether or not to clean up the tokenization spaces. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. If `None`, will default to + `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`). use_source_tokenizer (`bool`, *optional*, defaults to `False`): Whether or not to use the source tokenizer to decode sequences (only applicable in sequence-to-sequence problems). 
diff --git a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py index 54888aea2c..1708dbf125 100644 --- a/src/transformers/models/wav2vec2/tokenization_wav2vec2.py +++ b/src/transformers/models/wav2vec2/tokenization_wav2vec2.py @@ -373,7 +373,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer): self, token_ids: List[int], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, group_tokens: bool = True, spaces_between_special_tokens: bool = False, output_word_offsets: Optional[bool] = False, @@ -402,6 +402,11 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer): text = string_output["text"] + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) if clean_up_tokenization_spaces: text = self.clean_up_tokenization(text) @@ -421,7 +426,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer): self, sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, output_char_offsets: bool = False, output_word_offsets: bool = False, **kwargs, @@ -434,7 +439,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer): List of tokenized input ids. Can be obtained using the `__call__` method. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + clean_up_tokenization_spaces (`bool`, *optional*): Whether or not to clean up the tokenization spaces. output_char_offsets (`bool`, *optional*, defaults to `False`): Whether or not to output character offsets. 
Character offsets can be used in combination with the @@ -491,7 +496,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer): self, token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, output_char_offsets: bool = False, output_word_offsets: bool = False, **kwargs, @@ -507,7 +512,7 @@ class Wav2Vec2CTCTokenizer(PreTrainedTokenizer): List of tokenized input ids. Can be obtained using the `__call__` method. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + clean_up_tokenization_spaces (`bool`, *optional*): Whether or not to clean up the tokenization spaces. output_char_offsets (`bool`, *optional*, defaults to `False`): Whether or not to output character offsets. Character offsets can be used in combination with the @@ -887,7 +892,7 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer): self, token_ids: List[int], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, **kwargs, ) -> str: """ @@ -905,6 +910,11 @@ class Wav2Vec2Tokenizer(PreTrainedTokenizer): text = self.convert_tokens_to_string(result) + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) return clean_text diff --git a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py index f3ad23a1cd..f9a1cf631c 100644 --- a/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py +++ b/src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py @@ -409,7 +409,7 
@@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): self, token_ids: List[int], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, group_tokens: bool = True, filter_word_delimiter_token: bool = True, spaces_between_special_tokens: bool = False, @@ -438,6 +438,11 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): text = string_output["text"] + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) if clean_up_tokenization_spaces: text = self.clean_up_tokenization(text) @@ -451,7 +456,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): self, token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, output_char_offsets: bool = False, **kwargs, ) -> str: @@ -466,7 +471,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): List of tokenized input ids. Can be obtained using the `__call__` method. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + clean_up_tokenization_spaces (`bool`, *optional*): Whether or not to clean up the tokenization spaces. output_char_offsets (`bool`, *optional*, defaults to `False`): Whether or not to output character offsets. 
Character offsets can be used in combination with the @@ -507,7 +512,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): self, sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, output_char_offsets: bool = False, **kwargs, ) -> List[str]: @@ -519,7 +524,7 @@ class Wav2Vec2PhonemeCTCTokenizer(PreTrainedTokenizer): List of tokenized input ids. Can be obtained using the `__call__` method. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + clean_up_tokenization_spaces (`bool`, *optional*): Whether or not to clean up the tokenization spaces. output_char_offsets (`bool`, *optional*, defaults to `False`): Whether or not to output character offsets. Character offsets can be used in combination with the diff --git a/src/transformers/models/whisper/tokenization_whisper.py b/src/transformers/models/whisper/tokenization_whisper.py index 3d795e5b87..0160237304 100644 --- a/src/transformers/models/whisper/tokenization_whisper.py +++ b/src/transformers/models/whisper/tokenization_whisper.py @@ -556,7 +556,7 @@ class WhisperTokenizer(PreTrainedTokenizer): self, token_ids, skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, output_offsets: bool = False, time_precision=0.02, decode_with_timestamps: bool = False, @@ -573,8 +573,9 @@ class WhisperTokenizer(PreTrainedTokenizer): List of tokenized input ids. Can be obtained using the `__call__` method. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): - Whether or not to clean up the tokenization spaces. 
+ clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. If `None`, will default to + `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`). kwargs (additional keyword arguments, *optional*): Will be passed to the underlying model specific decode method. output_offsets (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/whisper/tokenization_whisper_fast.py b/src/transformers/models/whisper/tokenization_whisper_fast.py index 3110aac8b1..fb1bf89ed6 100644 --- a/src/transformers/models/whisper/tokenization_whisper_fast.py +++ b/src/transformers/models/whisper/tokenization_whisper_fast.py @@ -266,7 +266,7 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast): self, token_ids, skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, output_offsets: bool = False, time_precision=0.02, decode_with_timestamps: bool = False, @@ -283,8 +283,9 @@ class WhisperTokenizerFast(PreTrainedTokenizerFast): List of tokenized input ids. Can be obtained using the `__call__` method. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): - Whether or not to clean up the tokenization spaces. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. If `None`, will default to + `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`). kwargs (additional keyword arguments, *optional*): Will be passed to the underlying model specific decode method. 
output_offsets (`bool`, *optional*, defaults to `False`): diff --git a/src/transformers/models/xlnet/tokenization_xlnet.py b/src/transformers/models/xlnet/tokenization_xlnet.py index 5ad655c417..ec72df8c8b 100644 --- a/src/transformers/models/xlnet/tokenization_xlnet.py +++ b/src/transformers/models/xlnet/tokenization_xlnet.py @@ -254,7 +254,7 @@ class XLNetTokenizer(PreTrainedTokenizer): self, token_ids: List[int], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, spaces_between_special_tokens: bool = True, **kwargs, ) -> str: @@ -284,6 +284,11 @@ class XLNetTokenizer(PreTrainedTokenizer): # By default, there are no spaces between special tokens text = "".join(sub_texts) + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) return clean_text diff --git a/src/transformers/tokenization_utils.py b/src/transformers/tokenization_utils.py index 3398ee3091..a1454a744d 100644 --- a/src/transformers/tokenization_utils.py +++ b/src/transformers/tokenization_utils.py @@ -922,7 +922,7 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): self, token_ids: List[int], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, spaces_between_special_tokens: bool = True, **kwargs, ) -> str: @@ -953,6 +953,11 @@ class PreTrainedTokenizer(PreTrainedTokenizerBase): else: text = "".join(sub_texts) + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) return clean_text diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py index 
66164c2778..6162baf6f4 100644 --- a/src/transformers/tokenization_utils_base.py +++ b/src/transformers/tokenization_utils_base.py @@ -1470,6 +1470,9 @@ INIT_TOKENIZER_DOCSTRING = r""" A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the tokenization process. Will be associated to `self.additional_special_tokens` and `self.additional_special_tokens_ids`. + clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): + Whether or not the model should cleanup the spaces that were added when splitting the input text during the + tokenization process. """ @@ -1521,6 +1524,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) + # By default, cleaning tokenization spaces for both fast and slow tokenizers + self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True) + self.deprecation_warnings = ( {} ) # Use to store when we have already noticed a deprecation warning (avoid overlogging). 
@@ -1576,7 +1582,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): f"{self.__class__.__name__}(name_or_path='{self.name_or_path}'," f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast}," f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}'," - f" special_tokens={self.special_tokens_map_extended})" + f" special_tokens={self.special_tokens_map_extended}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces})" ) def __len__(self) -> int: @@ -2112,7 +2118,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): # TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers # target_keys = self.init_kwargs.keys() - target_keys = ["model_max_length"] + target_keys = ["model_max_length", "clean_up_tokenization_spaces"] for k in target_keys: if hasattr(self, k): tokenizer_config[k] = getattr(self, k) @@ -3416,7 +3422,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): self, sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, **kwargs, ) -> List[str]: """ @@ -3427,8 +3433,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): List of tokenized input ids. Can be obtained using the `__call__` method. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): - Whether or not to clean up the tokenization spaces. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. If `None`, will default to + `self.clean_up_tokenization_spaces`. 
kwargs (additional keyword arguments, *optional*): Will be passed to the underlying model specific decode method. @@ -3449,7 +3456,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): self, token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, **kwargs, ) -> str: """ @@ -3463,8 +3470,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): List of tokenized input ids. Can be obtained using the `__call__` method. skip_special_tokens (`bool`, *optional*, defaults to `False`): Whether or not to remove special tokens in the decoding. - clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`): - Whether or not to clean up the tokenization spaces. + clean_up_tokenization_spaces (`bool`, *optional*): + Whether or not to clean up the tokenization spaces. If `None`, will default to + `self.clean_up_tokenization_spaces`. kwargs (additional keyword arguments, *optional*): Will be passed to the underlying model specific decode method. 
@@ -3485,7 +3493,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin): self, token_ids: Union[int, List[int]], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, **kwargs, ) -> str: raise NotImplementedError diff --git a/src/transformers/tokenization_utils_fast.py b/src/transformers/tokenization_utils_fast.py index b484464f68..75d86d1081 100644 --- a/src/transformers/tokenization_utils_fast.py +++ b/src/transformers/tokenization_utils_fast.py @@ -539,7 +539,7 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): self, token_ids: Union[int, List[int]], skip_special_tokens: bool = False, - clean_up_tokenization_spaces: bool = True, + clean_up_tokenization_spaces: bool = None, **kwargs, ) -> str: self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False) @@ -548,6 +548,11 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase): token_ids = [token_ids] text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens) + clean_up_tokenization_spaces = ( + clean_up_tokenization_spaces + if clean_up_tokenization_spaces is not None + else self.clean_up_tokenization_spaces + ) if clean_up_tokenization_spaces: clean_text = self.clean_up_tokenization(text) return clean_text diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index a8bdb7372b..53a12aedd6 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -3895,6 +3895,51 @@ class TokenizerTesterMixin: # Should not raise an error self.rust_tokenizer_class.from_pretrained(tmp_dir_2) + def test_clean_up_tokenization_spaces(self): + tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") + assert tokenizer.clean_up_tokenization_spaces is True + + tokens = tokenizer.encode("This shouldn't be! He'll go.") + decoded = tokenizer.decode(tokens) + assert decoded == "[CLS] this shouldn't be! he'll go. 
[SEP]" + + tokenizer.clean_up_tokenization_spaces = False + decoded = tokenizer.decode(tokens) + assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]" + assert decoded == tokenizer.decode(tokens, clean_up_tokenization_spaces=False) + + # Fast from slow + with tempfile.TemporaryDirectory() as tmp_dir_2: + tokenizer.save_pretrained(tmp_dir_2) + tokenizer_fast = BertTokenizerFast.from_pretrained(tmp_dir_2) + del tokenizer + + assert tokenizer_fast.clean_up_tokenization_spaces is False + decoded = tokenizer_fast.decode(tokens) + # fast and slow don't have the same output when we don't clean up + # tokenization spaces. Here `be!` vs `be !` and `go.` vs `go .` + assert decoded == "[CLS] this shouldn ' t be! he ' ll go. [SEP]" + + tokenizer_fast.clean_up_tokenization_spaces = True + assert tokenizer_fast.clean_up_tokenization_spaces is True + + decoded = tokenizer_fast.decode(tokens) + assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]" + + # Slow from fast + with tempfile.TemporaryDirectory() as tmp_dir_2: + tokenizer_fast.clean_up_tokenization_spaces = False + tokenizer_fast.save_pretrained(tmp_dir_2) + tokenizer = BertTokenizer.from_pretrained(tmp_dir_2) + + assert tokenizer_fast.clean_up_tokenization_spaces is False + decoded = tokenizer.decode(tokens) + assert decoded == "[CLS] this shouldn ' t be ! he ' ll go . [SEP]" + + tokenizer.clean_up_tokenization_spaces = True + decoded = tokenizer.decode(tokens) + assert decoded == "[CLS] this shouldn't be! he'll go. [SEP]" + class TokenizerUtilTester(unittest.TestCase): def test_cached_files_are_used_when_internet_is_down(self):