[split_special_tokens] Add support for split_special_tokens argument to encode (#25081)
* draft changes * update and add tests * styling for no * move test * path to usable model * update test * small update * update BERT-based tokenizers * don't use kwargs for _tokenize * don't use kwargs for _tokenize * fix copies * update * update test for special tokenizers * fixup * skip two tests * remove pdb breakpoint() * wowo * rewrite custom tests * nits * revert change in target keys * fix markup lm * update documentation of the argument
This commit is contained in:
@@ -1344,6 +1344,19 @@ class MarkupLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
|
||||
self.assertTrue(special_token_id in p_output)
|
||||
self.assertTrue(special_token_id in cr_output)
|
||||
|
||||
def test_split_special_tokens(self):
    """Check that ``split_special_tokens=True`` splits a registered special token.

    A token added through ``add_special_tokens`` must tokenize to exactly one
    piece by default, and to more than one piece when ``tokenize`` is called
    with ``split_special_tokens=True``.
    """
    # TODO this is only possible for slow currently
    tokenizer = self.get_tokenizer()
    special_token = "[SPECIAL_TOKEN]"
    tokenizer.add_special_tokens({"additional_special_tokens": [special_token]})

    # Default behaviour: the registered special token is kept as a single piece.
    encoded_special_token = tokenizer.tokenize(special_token, add_special_tokens=False)
    self.assertEqual(len(encoded_special_token), 1)

    # With splitting enabled the same text must yield several pieces.
    encoded_split_special_token = tokenizer.tokenize(
        special_token, add_special_tokens=False, split_special_tokens=True
    )
    # assertGreater reports the actual length on failure, unlike assertTrue(a > b).
    self.assertGreater(len(encoded_split_special_token), 1)
|
||||
|
||||
def test_training_new_tokenizer(self):
|
||||
# This feature only exists for fast tokenizers
|
||||
if not self.test_rust_tokenizer:
|
||||
|
||||
Reference in New Issue
Block a user