Tokenizer fast save (#11234)
* Save fast tokenizers in both formats * Fix for HerBERT * Proper fix * Properly test new behavior
This commit is contained in:
@@ -2729,7 +2729,10 @@ class TokenizerTesterMixin:
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
# Checks it save with the same files
|
||||
|
||||
# Checks it save with the same files + the tokenizer.json file for the fast one
|
||||
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
|
||||
tokenizer_r_files = tuple(f for f in tokenizer_r_files if "tokenizer.json" not in f)
|
||||
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
@@ -2744,6 +2747,44 @@ class TokenizerTesterMixin:
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
# Save tokenizer rust, legacy_format=True
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=True)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it save with the same files
|
||||
self.assertSequenceEqual(tokenizer_r_files, tokenizer_p_files)
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
# Save tokenizer rust, legacy_format=False
|
||||
tmpdirname2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_r_files = tokenizer_r.save_pretrained(tmpdirname2, legacy_format=False)
|
||||
tokenizer_p_files = tokenizer_p.save_pretrained(tmpdirname2)
|
||||
|
||||
# Checks it saved the tokenizer.json file
|
||||
self.assertTrue(any("tokenizer.json" in f for f in tokenizer_r_files))
|
||||
|
||||
# Checks everything loads correctly in the same way
|
||||
tokenizer_rp = tokenizer_r.from_pretrained(tmpdirname2)
|
||||
tokenizer_pp = tokenizer_p.from_pretrained(tmpdirname2)
|
||||
|
||||
# Check special tokens are set accordingly on Rust and Python
|
||||
for key in tokenizer_pp.special_tokens_map:
|
||||
self.assertTrue(hasattr(tokenizer_rp, key))
|
||||
|
||||
shutil.rmtree(tmpdirname2)
|
||||
|
||||
def test_embeded_special_tokens(self):
|
||||
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
|
||||
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
|
||||
|
||||
Reference in New Issue
Block a user