support new marian models (#15831)

* support not sharing embeddings

* update modeling

* update tokenizer

* fix conversion script

* always use self.shared

* boom boom

* begin tests

* update tests

* fix resize_decoder_token_embeddings

* address Patrick's comments

* style

* update conversion script

* fix conversion script

* fix tokenizer

* better name target vocab

* add integration test for tokenizer with two vocabs

* style

* address Patrick's comments

* add integration test for model
This commit is contained in:
Suraj Patil
2022-03-10 19:41:56 +01:00
committed by GitHub
parent e66743e6c9
commit ba21001f4c
6 changed files with 386 additions and 63 deletions

View File

@@ -268,6 +268,58 @@ class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
model.generate(input_ids, attention_mask=attention_mask)
model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
def test_share_encoder_decoder_embeddings(self):
config, input_dict = self.model_tester.prepare_config_and_inputs()
# check if embeddings are shared by default
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIs(model.get_encoder().embed_tokens, model.get_decoder().embed_tokens)
self.assertIs(model.get_encoder().embed_tokens.weight, model.get_decoder().embed_tokens.weight)
# check if embeddings are not shared when config.share_encoder_decoder_embeddings = False
config.share_encoder_decoder_embeddings = False
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsNot(model.get_encoder().embed_tokens, model.get_decoder().embed_tokens)
self.assertIsNot(model.get_encoder().embed_tokens.weight, model.get_decoder().embed_tokens.weight)
# check if a model with shared embeddings can be saved and loaded with share_encoder_decoder_embeddings = False
config, _ = self.model_tester.prepare_config_and_inputs()
for model_class in self.all_model_classes:
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname, share_encoder_decoder_embeddings=False)
self.assertIsNot(model.get_encoder().embed_tokens, model.get_decoder().embed_tokens)
self.assertIsNot(model.get_encoder().embed_tokens.weight, model.get_decoder().embed_tokens.weight)
def test_resize_decoder_token_embeddings(self):
config, _ = self.model_tester.prepare_config_and_inputs()
# check if resize_decoder_token_embeddings raises an error when embeddings are shared
for model_class in self.all_model_classes:
model = model_class(config)
with self.assertRaises(ValueError):
model.resize_decoder_token_embeddings(config.vocab_size + 1)
# check if decoder embeddings are resized when config.share_encoder_decoder_embeddings = False
config.share_encoder_decoder_embeddings = False
for model_class in self.all_model_classes:
model = model_class(config)
model.resize_decoder_token_embeddings(config.vocab_size + 1)
self.assertEqual(model.get_decoder().embed_tokens.weight.shape, (config.vocab_size + 1, config.d_model))
# check if lm_head is also resized
config, _ = self.model_tester.prepare_config_and_inputs()
config.share_encoder_decoder_embeddings = False
model = MarianMTModel(config)
model.resize_decoder_token_embeddings(config.vocab_size + 1)
self.assertEqual(model.lm_head.weight.shape, (config.vocab_size + 1, config.d_model))
def test_tie_word_embeddings_decoder(self):
pass
def assert_tensors_close(a, b, atol=1e-12, prefix=""):
"""If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error."""
@@ -529,6 +581,27 @@ class TestMarian_en_ROMANCE(MarianIntegrationTest):
self.assertEqual(self.expected_text, [x["translation_text"] for x in output])
@require_sentencepiece
@require_tokenizers
class TestMarian_FI_EN_V2(MarianIntegrationTest):
src = "fi"
tgt = "en"
src_text = [
"minä tykkään kirjojen lukemisesta",
"Pidän jalkapallon katsomisesta",
]
expected_text = ["I like to read books", "I like watching football"]
@classmethod
def setUpClass(cls) -> None:
cls.model_name = "hf-internal-testing/test-opus-tatoeba-fi-en-v2"
return cls
@slow
def test_batch_generation_en_fr(self):
self._assert_generated_batch_equal_expected()
@require_torch
class TestConversionUtils(unittest.TestCase):
def test_renaming_multilingual(self):