Fix slow GemmaTokenizer and improve SPM slow -> fast conversion process (#32191)

* Remove user-defined tokens which can be obtained through merges

* Remove debug line

* formatting

* Refactor spm slow -> fast converter

* revert unnecessary refactor

* set comprehension

* remove test files

* Use `vocab_scores`

* Always replace spiece underline with space in decode

* we no longer need token filtering

* Add save fast load slow unit test

* Remove tokenizers version check

* Remove duplicate code

* Make `<start_of_turn>` and `<end_of_turn>` special tokens

* Bias merge priority with length if score is the same

* Add unit test for merge priority

* CI
This commit is contained in:
Joshua Lochner
2024-07-30 23:36:38 +02:00
committed by GitHub
parent 026a173a64
commit 6e2d04e429
3 changed files with 120 additions and 151 deletions

View File

@@ -222,6 +222,17 @@ class GemmaIntegrationTest(unittest.TestCase):
self.tokenizer.add_eos_token = False
self.rust_tokenizer.add_eos_token = False
def test_fast_merge_priority(self):
slow_tokenizer = self.tokenizer
fast_tokenizer = self.rust_tokenizer
text = " "
target = [168, 153]
slow = slow_tokenizer.encode(text, add_special_tokens=False)
assert slow == target
fast = fast_tokenizer.encode(text, add_special_tokens=False)
assert fast == target
@unittest.skip(reason="Not super important and always failing. Let's skip it")
@slow
def test_conversion(self):
@@ -442,6 +453,30 @@ class GemmaIntegrationTest(unittest.TestCase):
for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
self.assertListEqual(tokenized_chat, expected_tokens)
def test_save_fast_load_slow(self):
# Ensure that we can save a fast tokenizer and load it as a slow tokenizer
slow_tokenizer = self.tokenizer
text = "a "
target_encoded = [2, 235250, 139]
slow = slow_tokenizer.encode(text, add_special_tokens=True)
assert slow == target_encoded
slow_decoded = slow_tokenizer.decode(slow, skip_special_tokens=True)
assert slow_decoded == text
with tempfile.TemporaryDirectory() as dirname:
# Save fast tokenizer
self.rust_tokenizer.save_pretrained(dirname)
# Load slow tokenizer with fast files present in the directory
slow_tokenizer_from_fast = GemmaTokenizer.from_pretrained(dirname)
slow_from_fast = slow_tokenizer_from_fast.encode(text, add_special_tokens=True)
assert slow_from_fast == target_encoded
slow_from_fast_decoded = slow_tokenizer_from_fast.decode(slow, skip_special_tokens=True)
assert slow_from_fast_decoded == text
@require_sentencepiece
@require_tokenizers