SPLIT PR: add user defined symbols and control symbols (#31305)

* PR SPLIT: moving original changes for adding user defined symbols

* adding gemma test and generalizing gemma converter

* ruff

* update common test

* update serialization test

* deberta v2 tests updates as rust version adds '.' as a user added token, so a space is not added

* removing commented lines

* applying feedback - use only added_tokens to add and check piece.type instead of trainer_spec for user_defined_symbols

* add comment referencing sentencepiece
This commit is contained in:
Ita Zaporozhets
2024-06-21 10:48:10 +02:00
committed by GitHub
parent 730a440734
commit 1e79eade41
6 changed files with 60 additions and 23 deletions

View File

@@ -193,6 +193,19 @@ class GemmaIntegrationTest(unittest.TestCase):
},
)
def test_user_added_tokens(self):
    """Check that the fast tokenizer does not split a user-added token.

    Encodes the literal ``"<mask>"`` with both ``self.tokenizer`` (slow) and
    ``self.rust_tokenizer`` (fast), maps both id sequences back to token
    strings, and asserts that the token appears intact in the fast result
    and that the two token sequences are equal.
    """
    slow_tokenizer = self.tokenizer
    fast_tokenizer = self.rust_tokenizer

    user_added_token = "<mask>"

    slow_tokens = slow_tokenizer.convert_ids_to_tokens(slow_tokenizer.encode(user_added_token))
    # The fast tokenizer's ids are mapped back with the slow tokenizer too, so
    # both id sequences go through the same id-to-token lookup.
    # NOTE(review): presumably deliberate (compares ids under one vocabulary) - confirm.
    fast_tokens = slow_tokenizer.convert_ids_to_tokens(fast_tokenizer.encode(user_added_token))

    # assertIn reports the token and the list on failure, unlike assertTrue(x in y).
    self.assertIn(user_added_token, fast_tokens)
    self.assertEqual(slow_tokens, fast_tokens)
def test_fast_special_tokens(self):
slow_tokenizer = self.tokenizer
fast_tokenizer = self.rust_tokenizer