improve saving strategy of sentencepiece tokenizer (#15328)

* add new test

* add a feature to save the sentencepiece tokenizer model when the init file was deleted

* update marian

* update m2m_100

* fix marian

* update speech to text

* override test for layoutxlm

* fix saving bartpho

* remove hardcoded values bartpho

* special token string version

* finish bartpho

* override layoutxlm test

* add mbart

* move special tokens list

* format

* Revert "format"

This reverts commit 37a40df37903a932c2f951cbd33acb684246bae7.

* simplify list of string of special tokens

* Re-write `self.fairseq_tokens_to_ids` initialization logic with special tokens

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>

Co-authored-by: Sylvain Gugger <sylvain.gugger@gmail.com>
This commit is contained in:
SaulLu
2022-01-27 16:24:51 +01:00
committed by GitHub
parent 196cce6e9b
commit ade7371a41
21 changed files with 202 additions and 36 deletions

View File

@@ -394,6 +394,33 @@ class TokenizerTesterMixin:
self.assertEqual(tokenizer_new.sp_model_kwargs, sp_model_kwargs)
self.check_subword_sampling(tokenizer_new)
def test_save_sentencepiece_tokenizer(self) -> None:
    """Check that a slow sentencepiece tokenizer can be re-saved after its source files are gone.

    The tokenizer is saved and reloaded once, the directory it was reloaded
    from is deleted, and the reloaded tokenizer is then saved and reloaded a
    second time. All three tokenizers must produce the same encoding for the
    same text.

    The test is a no-op when `test_sentencepiece` or `test_slow_tokenizer` is
    disabled on the test class.
    """
    if not self.test_sentencepiece or not self.test_slow_tokenizer:
        return

    # We want to verify that we will be able to save the tokenizer even if the original files that were used to
    # build the tokenizer have been deleted in the meantime.
    text = "This is text to test the tokenizer."

    tokenizer_slow_1 = self.get_tokenizer()
    encoding_tokenizer_slow_1 = tokenizer_slow_1(text)

    # Context managers guarantee the temporary directories are removed even if
    # saving, loading or encoding raises.
    with tempfile.TemporaryDirectory() as tmpdirname_1:
        tokenizer_slow_1.save_pretrained(tmpdirname_1)
        tokenizer_slow_2 = self.tokenizer_class.from_pretrained(tmpdirname_1)
        encoding_tokenizer_slow_2 = tokenizer_slow_2(text)

    # At this point the directory `tokenizer_slow_2` was loaded from no longer
    # exists; saving it again is the scenario under test.
    with tempfile.TemporaryDirectory() as tmpdirname_2:
        tokenizer_slow_2.save_pretrained(tmpdirname_2)
        tokenizer_slow_3 = self.tokenizer_class.from_pretrained(tmpdirname_2)
        encoding_tokenizer_slow_3 = tokenizer_slow_3(text)

    self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_2)
    self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_3)
def test_model_input_names_signature(self):
accepted_model_main_input_names = [
"input_ids", # nlp models

View File

@@ -99,6 +99,44 @@ class LayoutXLMTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
output_text = "unwanted, running"
return input_text, output_text
# override test in `test_tokenization_common.py` because of the required input format of the `__call__` method of
# this tokenizer
def test_save_sentencepiece_tokenizer(self) -> None:
    """Check that a slow sentencepiece tokenizer can be re-saved after its source files are gone.

    The tokenizer is called with `words` and `boxes` (as returned by
    `get_words_and_boxes`) rather than with a plain string. It is saved and
    reloaded once, the directory it was reloaded from is deleted, and the
    reloaded tokenizer is then saved and reloaded a second time. All three
    tokenizers must produce the same encoding.

    The test is a no-op when `test_sentencepiece` or `test_slow_tokenizer` is
    disabled on the test class.
    """
    if not self.test_sentencepiece or not self.test_slow_tokenizer:
        return

    # We want to verify that we will be able to save the tokenizer even if the original files that were used to
    # build the tokenizer have been deleted in the meantime.
    words, boxes = self.get_words_and_boxes()

    tokenizer_slow_1 = self.get_tokenizer()
    encoding_tokenizer_slow_1 = tokenizer_slow_1(
        words,
        boxes=boxes,
    )

    # Context managers guarantee the temporary directories are removed even if
    # saving, loading or encoding raises.
    with tempfile.TemporaryDirectory() as tmpdirname_1:
        tokenizer_slow_1.save_pretrained(tmpdirname_1)
        tokenizer_slow_2 = self.tokenizer_class.from_pretrained(tmpdirname_1)
        encoding_tokenizer_slow_2 = tokenizer_slow_2(
            words,
            boxes=boxes,
        )

    # At this point the directory `tokenizer_slow_2` was loaded from no longer
    # exists; saving it again is the scenario under test.
    with tempfile.TemporaryDirectory() as tmpdirname_2:
        tokenizer_slow_2.save_pretrained(tmpdirname_2)
        tokenizer_slow_3 = self.tokenizer_class.from_pretrained(tmpdirname_2)
        encoding_tokenizer_slow_3 = tokenizer_slow_3(
            words,
            boxes=boxes,
        )

    self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_2)
    self.assertEqual(encoding_tokenizer_slow_1, encoding_tokenizer_slow_3)
@slow
def test_sequence_builders(self):
tokenizer = self.tokenizer_class.from_pretrained("microsoft/layoutxlm-base")

View File

@@ -39,6 +39,7 @@ class MBartTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
tokenizer_class = MBartTokenizer
rust_tokenizer_class = MBartTokenizerFast
test_rust_tokenizer = True
test_sentencepiece = True
def setUp(self):
super().setUp()