[split_special_tokens] Add support for split_special_tokens argument to encode (#25081)

* draft changes * update and add tests * styling for no * move test * path to usable model * update test * small update * update bertbased tokenizers * don'tuse kwargs for _tokenize * don'tuse kwargs for _tokenize * fix copies * update * update test for special tokenizers * fixup * skip two tests * remove pdb breakpiont() * wowo * rewrite custom tests * nits * revert chang in target keys * fix markup lm * update documentation of the argument
2023-08-18 13:26:27 +02:00
parent 9d7afd2536
commit 30b3c46ff5
18 changed files with 122 additions and 24 deletions
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -1492,6 +1492,11 @@ INIT_TOKENIZER_DOCSTRING = r"""
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
            Whether or not the model should cleanup the spaces that were added when splitting the input text during the
            tokenization process.
+        split_special_tokens (`bool`, *optional*, defaults to `False`):
+            Whether or not the special tokens should be split during the tokenization process. The default behavior is
+            to not split special tokens. This means that if `<s>` is the `bos_token`, then `tokenizer.tokenize("<s>") =
+            ['<s>`]. Otherwise, if `split_special_tokens=True`, then `tokenizer.tokenize("<s>")` will be give `['<',
+            's', '>']`. This argument is only supported for `slow` tokenizers for the moment.
 """


@@ -1546,6 +1551,9 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
        # By default, cleaning tokenization spaces for both fast and slow tokenizers
        self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True)

+        # By default, do not split special tokens for both fast and slow tokenizers
+        self.split_special_tokens = kwargs.pop("split_special_tokens", False)
+
        self.deprecation_warnings = (
            {}
        )  # Use to store when we have already noticed a deprecation warning (avoid overlogging).