Fix slow GemmaTokenizer and improve SPM slow -> fast conversion process (#32191)
* Remove user-defined tokens which can be obtained through merges
* Remove debug line
* formatting
* Refactor spm slow -> fast converter
* revert unnecessary refactor
* set comprehension
* remove test files
* Use `vocab_scores`
* Always replace spiece underline with space in decode
* we no longer need token filtering
* Add save fast load slow unit test
* Remove tokenizers version check
* Remove duplicate code
* Make `<start_of_turn>` and `<end_of_turn>` special tokens
* Bias merge priority with length if score is the same
* Add unit test for merge priority
* CI
This commit is contained in:
@@ -222,6 +222,17 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
self.tokenizer.add_eos_token = False
|
||||
self.rust_tokenizer.add_eos_token = False
|
||||
|
||||
def test_fast_merge_priority(self):
    """Check that the slow and fast tokenizers agree on the encoded ids.

    Encodes the same text with `self.tokenizer` and `self.rust_tokenizer`
    (no special tokens added) and requires both to yield `[168, 153]`.
    """
    # NOTE(review): consecutive spaces in this literal may have been collapsed
    # by the page rendering -- confirm the exact string against the repository.
    text = " "
    target = [168, 153]

    # Use unittest assertions instead of bare `assert`: they are not stripped
    # under `python -O` and they report a readable diff on failure.
    slow = self.tokenizer.encode(text, add_special_tokens=False)
    self.assertEqual(slow, target)

    fast = self.rust_tokenizer.encode(text, add_special_tokens=False)
    self.assertEqual(fast, target)
|
||||
|
||||
@unittest.skip(reason="Not super important and always failing. Let's skip it")
|
||||
@slow
|
||||
def test_conversion(self):
|
||||
@@ -442,6 +453,30 @@ class GemmaIntegrationTest(unittest.TestCase):
|
||||
for tokenized_chat, expected_tokens in zip(tokenized_chats, expected_tokens):
|
||||
self.assertListEqual(tokenized_chat, expected_tokens)
|
||||
|
||||
def test_save_fast_load_slow(self):
    """Ensure that a saved fast tokenizer can be loaded as a slow tokenizer.

    The slow tokenizer is first checked against the expected ids and the
    decode round trip. The fast tokenizer is then saved to a temporary
    directory, reloaded through `GemmaTokenizer.from_pretrained`, and the
    reloaded tokenizer must reproduce the same ids and the same round trip.
    """
    slow_tokenizer = self.tokenizer
    # NOTE(review): consecutive spaces in this literal may have been collapsed
    # by the page rendering -- confirm the exact string against the repository.
    text = "a "
    target_encoded = [2, 235250, 139]

    # unittest assertions are used instead of bare `assert` so the checks
    # survive `python -O` and produce a diff on failure.
    slow = slow_tokenizer.encode(text, add_special_tokens=True)
    self.assertEqual(slow, target_encoded)

    slow_decoded = slow_tokenizer.decode(slow, skip_special_tokens=True)
    self.assertEqual(slow_decoded, text)

    with tempfile.TemporaryDirectory() as dirname:
        # Save fast tokenizer
        self.rust_tokenizer.save_pretrained(dirname)

        # Load slow tokenizer with fast files present in the directory
        slow_tokenizer_from_fast = GemmaTokenizer.from_pretrained(dirname)

    slow_from_fast = slow_tokenizer_from_fast.encode(text, add_special_tokens=True)
    self.assertEqual(slow_from_fast, target_encoded)

    # Decode the ids produced by the reloaded tokenizer itself (previously the
    # ids from the original slow tokenizer were decoded here), so the reloaded
    # tokenizer's own encode -> decode round trip is what gets verified.
    slow_from_fast_decoded = slow_tokenizer_from_fast.decode(slow_from_fast, skip_special_tokens=True)
    self.assertEqual(slow_from_fast_decoded, text)
|
||||
|
||||
|
||||
@require_sentencepiece
|
||||
@require_tokenizers
|
||||
|
||||
Reference in New Issue
Block a user