improve saving strategy of sentencepiece tokenizer (#15328)

* add new test

* add a feature to same the sentencepiece tokenizer model when the init file was deleted

* update marian

* update m2m_100

* fix marian

* update speech to text

* override test for layoutxlm

* fix saving bartpho

* remove harcoded values bartpho

* special token string version

* finish bartpho

* override layoutxml test

* add mbart

* move special tokens list

* format

* Revert "format"

This reverts commit 37a40df37903a932c2f951cbd33acb684246bae7.

* simplify list of string of special tokens

* Re-write `self.fairseq_tokens_to_ids ` initialization logic with special tokens

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
This commit is contained in:
SaulLu
2022-01-27 16:24:51 +01:00
committed by GitHub
parent 196cce6e9b
commit ade7371a41
21 changed files with 202 additions and 36 deletions

View File

@@ -394,6 +394,33 @@ class TokenizerTesterMixin:
self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)
self.check_subword_sampling(tokenizer_new)
def test_save_sentencepiece_tokenizer(self) -> None:
if not self.test_sentencepiece or not self.test_slow_tokenizer:
return
# We want to verify that we will be able to save the tokenizer even if the original files that were used to
# build the tokenizer have been deleted in the meantime.
text = "This is text to test the tokenizer."
tokenizer_slow_1 = self.get_tokenizer()
encoding_tokenizer_slow_1 = tokenizer_slow_1(text)
tmpdirname_1 = tempfile.mkdtemp()
tmpdirname_2 = tempfile.mkdtemp()
tokenizer_slow_1.save_pretrained(tmpdirname_1)
tokenizer_slow_2 = self.tokenizer_class.from_pretrained(tmpdirname_1)
encoding_tokenizer_slow_2 = tokenizer_slow_2(text)
shutil.rmtree(tmpdirname_1)
tokenizer_slow_2.save_pretrained(tmpdirname_2)
tokenizer_slow_3 = self.tokenizer_class.from_pretrained(tmpdirname_2)
encoding_tokenizer_slow_3 = tokenizer_slow_3(text)
shutil.rmtree(tmpdirname_2)
self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_2)
self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_3)
def test_model_input_names_signature(self):
accepted_model_main_input_names = [
"input_ids", # nlp models