Add clean_up_tokenization_spaces to config (#22341)

* add draft changes

* fix failing wav2vec

* style

* make sure that the argument is saved + add tests

* style

* fixup

* update test

* default clean_up_tokenization_spaces to False for Bloom and Llama

* Update code based on review

Co-authored-by: Nicolas Patry <patry.nicolas@gmail.com>

* style

* quality

---------

Co-authored-by: Nicolas Patry <patry.nicolas@gmail.com>
This commit is contained in:
Arthur
2023-03-29 13:21:07 +02:00
committed by GitHub
parent b29fd6971d
commit 8d9c3836be
16 changed files with 150 additions and 42 deletions

View File

@@ -1470,6 +1470,9 @@ INIT_TOKENIZER_DOCSTRING = r"""
A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
tokenization process. Will be associated to `self.additional_special_tokens` and
`self.additional_special_tokens_ids`.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not the model should cleanup the spaces that were added when splitting the input text during the
tokenization process.
"""
@@ -1521,6 +1524,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
# By default, cleaning tokenization spaces for both fast and slow tokenizers
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)
self.deprecation_warnings = (
{}
) # Use to store when we have already noticed a deprecation warning (avoid overlogging).
@@ -1576,7 +1582,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
f"{self.__class__.__name__}(name_or_path='{self.name_or_path}',"
f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast},"
f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
f" special_tokens={self.special_tokens_map_extended})"
f" special_tokens={self.special_tokens_map_extended}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces})"
)
def __len__(self) -> int:
@@ -2112,7 +2118,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
# TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers
# target_keys = self.init_kwargs.keys()
target_keys = ["model_max_length"]
target_keys = ["model_max_length", "clean_up_tokenization_spaces"]
for k in target_keys:
if hasattr(self, k):
tokenizer_config[k] = getattr(self, k)
@@ -3416,7 +3422,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
self,
sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
clean_up_tokenization_spaces: bool = None,
**kwargs,
) -> List[str]:
"""
@@ -3427,8 +3433,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
List of tokenized input ids. Can be obtained using the `__call__` method.
skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up the tokenization spaces.
clean_up_tokenization_spaces (`bool`, *optional*):
Whether or not to clean up the tokenization spaces. If `None`, will default to
`self.clean_up_tokenization_spaces`.
kwargs (additional keyword arguments, *optional*):
Will be passed to the underlying model specific decode method.
@@ -3449,7 +3456,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
self,
token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
clean_up_tokenization_spaces: bool = None,
**kwargs,
) -> str:
"""
@@ -3463,8 +3470,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
List of tokenized input ids. Can be obtained using the `__call__` method.
skip_special_tokens (`bool`, *optional*, defaults to `False`):
Whether or not to remove special tokens in the decoding.
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
Whether or not to clean up the tokenization spaces.
clean_up_tokenization_spaces (`bool`, *optional*):
Whether or not to clean up the tokenization spaces. If `None`, will default to
`self.clean_up_tokenization_spaces`.
kwargs (additional keyword arguments, *optional*):
Will be passed to the underlying model specific decode method.
@@ -3485,7 +3493,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
self,
token_ids: Union[int, List[int]],
skip_special_tokens: bool = False,
clean_up_tokenization_spaces: bool = True,
clean_up_tokenization_spaces: bool = None,
**kwargs,
) -> str:
raise NotImplementedError