improve saving strategy of sentencepiece tokenizer (#15328)
* add new test * add a feature to save the sentencepiece tokenizer model when the init file was deleted * update marian * update m2m_100 * fix marian * update speech to text * override test for layoutxlm * fix saving bartpho * remove hardcoded values bartpho * special token string version * finish bartpho * override layoutxlm test * add mbart * move special tokens list * format * Revert "format" This reverts commit 37a40df37903a932c2f951cbd33acb684246bae7. * simplify list of string of special tokens * Re-write `self.fairseq_tokens_to_ids` initialization logic with special tokens Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com> Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
This commit is contained in:
@@ -394,6 +394,33 @@ class TokenizerTesterMixin:
|
||||
self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)
|
||||
self.check_subword_sampling(tokenizer_new)
|
||||
|
||||
def test_save_sentencepiece_tokenizer(self) -> None:
|
||||
if not self.test_sentencepiece or not self.test_slow_tokenizer:
|
||||
return
|
||||
# We want to verify that we will be able to save the tokenizer even if the original files that were used to
|
||||
# build the tokenizer have been deleted in the meantime.
|
||||
text = "This is text to test the tokenizer."
|
||||
|
||||
tokenizer_slow_1 = self.get_tokenizer()
|
||||
encoding_tokenizer_slow_1 = tokenizer_slow_1(text)
|
||||
|
||||
tmpdirname_1 = tempfile.mkdtemp()
|
||||
tmpdirname_2 = tempfile.mkdtemp()
|
||||
|
||||
tokenizer_slow_1.save_pretrained(tmpdirname_1)
|
||||
tokenizer_slow_2 = self.tokenizer_class.from_pretrained(tmpdirname_1)
|
||||
encoding_tokenizer_slow_2 = tokenizer_slow_2(text)
|
||||
|
||||
shutil.rmtree(tmpdirname_1)
|
||||
tokenizer_slow_2.save_pretrained(tmpdirname_2)
|
||||
|
||||
tokenizer_slow_3 = self.tokenizer_class.from_pretrained(tmpdirname_2)
|
||||
encoding_tokenizer_slow_3 = tokenizer_slow_3(text)
|
||||
shutil.rmtree(tmpdirname_2)
|
||||
|
||||
self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_2)
|
||||
self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_3)
|
||||
|
||||
def test_model_input_names_signature(self):
|
||||
accepted_model_main_input_names = [
|
||||
"input_ids", # nlp models
|
||||
|
||||
Reference in New Issue
Block a user