[LlamaTokenizerFast] nit update post_processor on the fly (#23855)

* Update the processor when changing add_eos and add_bos

* fixup

* update

* add a test

* fix failing tests

* fixup
This commit is contained in:
Arthur
2023-05-30 16:50:41 +02:00
committed by GitHub
parent 0623f08e99
commit 6fc0454b2f
2 changed files with 77 additions and 0 deletions

View File

@@ -315,6 +315,39 @@ class LlamaIntegrationTest(unittest.TestCase):
},
)
def test_fast_special_tokens(self):
slow_tokenizer = self.tokenizer
fast_tokenizer = self.rust_tokenizer
slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
assert slow == [1, 319, 4559, 1243]
fast_tokenizer.add_eos_token = False
fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
assert fast == [1, 319, 4559, 1243]
fast_tokenizer.add_eos_token = True
fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
assert fast == [1, 319, 4559, 1243, 2]
slow_tokenizer.add_eos_token = True
slow = slow_tokenizer.encode("A sample test", add_special_tokens=True)
assert slow == [1, 319, 4559, 1243, 2]
fast_tokenizer = LlamaTokenizerFast.from_pretrained(
"hf-internal-testing/llama-tokenizer", add_eos_token=True, add_bos_token=False
)
fast = fast_tokenizer.encode("A sample test", add_special_tokens=True)
assert fast == [319, 4559, 1243, 2]
slow_tokenzier = LlamaTokenizer.from_pretrained(
"hf-internal-testing/llama-tokenizer", add_eos_token=True, add_bos_token=False
)
slow = slow_tokenzier.encode("A sample test", add_special_tokens=True)
assert slow == [319, 4559, 1243, 2]
self.tokenizer.add_eos_token = False
self.rust_tokenizer.add_eos_token = False
@slow
def test_conversion(self):
# This is excruciatingly slow since it has to recreate the entire merge