Add clean_up_tokenization_spaces to config (#22341)
* add draft changes
* fix failing wav2vec
* style
* make sure that the argument is saved + add tests
* style
* fixup
* update test
* default clean_up_tokenization_spaces to False for Bloom and Llama
* Update code based on review

Co-authored-by: Nicolas Patry <patry.nicolas@gmail.com>

* style
* quality

---------

Co-authored-by: Nicolas Patry <patry.nicolas@gmail.com>
This commit is contained in:
@@ -1470,6 +1470,9 @@ INIT_TOKENIZER_DOCSTRING = r"""
|
||||
A tuple or a list of additional special tokens. Add them here to ensure they won't be split by the
|
||||
tokenization process. Will be associated to `self.additional_special_tokens` and
|
||||
`self.additional_special_tokens_ids`.
|
||||
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not the model should clean up the spaces that were added when splitting the input text during the
|
||||
tokenization process.
|
||||
"""
|
||||
|
||||
|
||||
@@ -1521,6 +1524,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
|
||||
self.model_input_names = kwargs.pop("model_input_names", self.model_input_names)
|
||||
|
||||
# By default, clean up tokenization spaces for both fast and slow tokenizers
|
||||
self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)
|
||||
|
||||
self.deprecation_warnings = (
|
||||
{}
|
||||
) # Used to store when we have already noticed a deprecation warning (avoid overlogging).
|
||||
@@ -1576,7 +1582,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
f"{self.__class__.__name__}(name_or_path='{self.name_or_path}',"
|
||||
f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast},"
|
||||
f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}',"
|
||||
f" special_tokens={self.special_tokens_map_extended})"
|
||||
f" special_tokens={self.special_tokens_map_extended}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces})"
|
||||
)
|
||||
|
||||
def __len__(self) -> int:
|
||||
@@ -2112,7 +2118,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
|
||||
# TODO: Ensure the modified attributes (those are also in the __init__ kwargs) will give identical tokenizers
|
||||
# target_keys = self.init_kwargs.keys()
|
||||
target_keys = ["model_max_length"]
|
||||
target_keys = ["model_max_length", "clean_up_tokenization_spaces"]
|
||||
for k in target_keys:
|
||||
if hasattr(self, k):
|
||||
tokenizer_config[k] = getattr(self, k)
|
||||
@@ -3416,7 +3422,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
self,
|
||||
sequences: Union[List[int], List[List[int]], "np.ndarray", "torch.Tensor", "tf.Tensor"],
|
||||
skip_special_tokens: bool = False,
|
||||
clean_up_tokenization_spaces: bool = True,
|
||||
clean_up_tokenization_spaces: bool = None,
|
||||
**kwargs,
|
||||
) -> List[str]:
|
||||
"""
|
||||
@@ -3427,8 +3433,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
List of tokenized input ids. Can be obtained using the `__call__` method.
|
||||
skip_special_tokens (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to remove special tokens in the decoding.
|
||||
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to clean up the tokenization spaces.
|
||||
clean_up_tokenization_spaces (`bool`, *optional*):
|
||||
Whether or not to clean up the tokenization spaces. If `None`, will default to
|
||||
`self.clean_up_tokenization_spaces`.
|
||||
kwargs (additional keyword arguments, *optional*):
|
||||
Will be passed to the underlying model specific decode method.
|
||||
|
||||
@@ -3449,7 +3456,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
self,
|
||||
token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
|
||||
skip_special_tokens: bool = False,
|
||||
clean_up_tokenization_spaces: bool = True,
|
||||
clean_up_tokenization_spaces: bool = None,
|
||||
**kwargs,
|
||||
) -> str:
|
||||
"""
|
||||
@@ -3463,8 +3470,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
List of tokenized input ids. Can be obtained using the `__call__` method.
|
||||
skip_special_tokens (`bool`, *optional*, defaults to `False`):
|
||||
Whether or not to remove special tokens in the decoding.
|
||||
clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
|
||||
Whether or not to clean up the tokenization spaces.
|
||||
clean_up_tokenization_spaces (`bool`, *optional*):
|
||||
Whether or not to clean up the tokenization spaces. If `None`, will default to
|
||||
`self.clean_up_tokenization_spaces`.
|
||||
kwargs (additional keyword arguments, *optional*):
|
||||
Will be passed to the underlying model specific decode method.
|
||||
|
||||
@@ -3485,7 +3493,7 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
|
||||
self,
|
||||
token_ids: Union[int, List[int]],
|
||||
skip_special_tokens: bool = False,
|
||||
clean_up_tokenization_spaces: bool = True,
|
||||
clean_up_tokenization_spaces: bool = None,
|
||||
**kwargs,
|
||||
) -> str:
|
||||
raise NotImplementedError
|
||||
|
||||
Reference in New Issue
Block a user