Add split special tokens (#30772)

* seems like `split_special_tokens` is used here

* split special token

* add new line at end of file

* moving split special token test to common tests

* added assertions

* test

* fixup

* add co-author

* passing rest of args to gptsan_japanese, fixing tests

* removing direct comparison of fast and slow models

* adding test support for UDOP and LayoutXLM

* ruff fix

* readd check if slow tokenizer

* modify test to handle bos tokens

* removing commented function

* trigger build

* applying review feedback - updated docstrings, var names, and simplified tests

* ruff fixes

* Update tests/test_tokenization_common.py

Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>

* applying feedback, comments

* shutil temp directory fix

---------

Co-authored-by: Arthur Zucker <arthur.zucker@gmail.com>
Co-authored-by: Ita Zaporozhets <itazaporozhets@Itas-MBP.localdomain>
Co-authored-by: itazap <itazap@users.noreply.github.com>
Co-authored-by: Arthur <48595927+ArthurZucker@users.noreply.github.com>
Co-authored-by: Ita Zaporozhets <itazaporozhets@Itas-MacBook-Pro.local>
This commit is contained in:
Ita Zaporozhets
2024-05-24 17:38:58 +02:00
committed by GitHub
parent e5103a76cc
commit deba7655e6
9 changed files with 165 additions and 39 deletions

View File

@@ -1921,3 +1921,48 @@ class UdopTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
excepted_decoding = "<pad> paragraph<loc_58><loc_34><loc_446><loc_449></s>"
assert decoding == excepted_decoding
def test_split_special_tokens(self):
    """Check that `split_special_tokens` is honored by the slow and fast tokenizers.

    For every `(tokenizer, pretrained_name, kwargs)` entry in
    `self.tokenizers_list`, both tokenizer flavors are loaded with an extra
    special token and `split_special_tokens=True`. The test then verifies that:

    * the special token is split by default, for both `encode` and `tokenize`;
    * passing `split_special_tokens=False` per call keeps the token whole;
    * a slow tokenizer written with `save_pretrained` and loaded back shows
      the same default and the same per-call override.
    """
    # Loop-invariant inputs: the same token and sentence are used for every tokenizer.
    special_token = "<my_new_token>"
    special_sentence = f"Hey this is a {special_token} token"

    for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
        # NOTE(review): `boxes` is never used below; the call is kept unchanged in case
        # the helper has side effects -- confirm and drop it if it is pure.
        _, _, boxes = self.get_question_words_and_boxes()

        with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
            tokenizer_rust = self.rust_tokenizer_class.from_pretrained(
                pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
            )
            tokenizer_py = self.tokenizer_class.from_pretrained(
                pretrained_name, additional_special_tokens=[special_token], split_special_tokens=True, **kwargs
            )

            # `encode`: the per-call override keeps the token whole, the default splits it.
            special_token_id = tokenizer_py.convert_tokens_to_ids(special_token)
            encoded_special_token_unsplit = tokenizer_py.encode(
                special_token, add_special_tokens=False, split_special_tokens=False
            )
            self.assertIn(special_token_id, encoded_special_token_unsplit)

            encoded_special_token_split = tokenizer_py.encode(special_token, add_special_tokens=False)
            self.assertNotIn(special_token_id, encoded_special_token_split)

            # `tokenize`: by default the special token must not survive as a single piece.
            py_tokens_output = tokenizer_py.tokenize(special_sentence)
            rust_tokens_output = tokenizer_rust.tokenize(special_sentence)
            self.assertNotIn(special_token, py_tokens_output)
            self.assertNotIn(special_token, rust_tokens_output)

            # `tokenize` with the per-call override keeps the special token intact.
            py_tokens_output_unsplit = tokenizer_py.tokenize(special_sentence, split_special_tokens=False)
            rust_tokens_output_unsplit = tokenizer_rust.tokenize(special_sentence, split_special_tokens=False)
            self.assertIn(special_token, py_tokens_output_unsplit)
            self.assertIn(special_token, rust_tokens_output_unsplit)

            # Round-trip through disk. The context manager removes the directory even
            # when an assertion fails, so repeated runs do not leak temp directories.
            with tempfile.TemporaryDirectory() as tmpdirname:
                tokenizer_py.save_pretrained(tmpdirname)
                # Loaded with `self.tokenizer_class`, i.e. the same class as `tokenizer_py`.
                reloaded_tokenizer = self.tokenizer_class.from_pretrained(tmpdirname)

                output_tokens_reloaded_split = reloaded_tokenizer.tokenize(special_sentence)
                self.assertNotIn(special_token, output_tokens_reloaded_split)

                output_tokens_reloaded_unsplit = reloaded_tokenizer.tokenize(
                    special_sentence, split_special_tokens=False
                )
                self.assertIn(special_token, output_tokens_reloaded_unsplit)