[split_special_tokens] Add support for split_special_tokens argument to encode (#25081)

* draft changes

* update and add tests

* styling for no

* move test

* path to usable model

* update test

* small update

* update bertbased tokenizers

* don'tuse kwargs for _tokenize

* don'tuse kwargs for _tokenize

* fix copies

* update

* update test for special tokenizers

* fixup

* skip two tests

* remove pdb breakpiont()

* wowo

* rewrite custom tests

* nits

* revert chang in target keys

* fix markup lm

* update documentation of the argument
This commit is contained in:
Arthur
2023-08-18 13:26:27 +02:00
committed by GitHub
parent 9d7afd2536
commit 30b3c46ff5
18 changed files with 122 additions and 24 deletions

View File

@@ -144,6 +144,19 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_2)
self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_3)
def test_split_special_tokens(self):
tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutxlm-base")
_, _, boxes = self.get_question_words_and_boxes()
special_token = "[SPECIAL_TOKEN]"
tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})
encoded_special_token = tokenizer.tokenize(special_token, boxes=boxes, add_special_tokens=False)
self.assertEqual(len(encoded_special_token), 1)
encoded_split_special_token = tokenizer.tokenize(
special_token, add_special_tokens=False, split_special_tokens=True, boxes=boxes
)
self.assertTrue(len(encoded_split_special_token) > 1)
@slow
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutxlm-base")