support new marian models (#15831)
* support not sharing embeddings
* update modeling
* update tokenizer
* fix conversion script
* always use self.shared
* boom boom
* begin tests
* update tests
* fix resize_decoder_token_embeddings
* address Patrick's comments
* style
* update conversion script
* fix conversion script
* fix tokenizer
* better name target vocab
* add integration test for tokenizer with two vocabs
* style
* address Patrick's comments
* add integration test for model
This commit is contained in:
@@ -268,6 +268,58 @@ class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
|
||||
model.generate(input_ids, attention_mask=attention_mask)
|
||||
model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
|
||||
|
||||
def test_share_encoder_decoder_embeddings(self):
    # Exercises the `share_encoder_decoder_embeddings` config switch in three
    # scenarios: the default config, the flag set to False on the config, and
    # the flag passed as an override to `from_pretrained`.
    config, _ = self.model_tester.prepare_config_and_inputs()

    # Default config: encoder and decoder must hold the very same embedding
    # module, and therefore the very same weight tensor.
    for model_class in self.all_model_classes:
        model = model_class(config)
        encoder_embed = model.get_encoder().embed_tokens
        decoder_embed = model.get_decoder().embed_tokens
        self.assertIs(encoder_embed, decoder_embed)
        self.assertIs(encoder_embed.weight, decoder_embed.weight)

    # Flag disabled on the config: the two sides must own distinct modules
    # and distinct weight tensors.
    config.share_encoder_decoder_embeddings = False
    for model_class in self.all_model_classes:
        model = model_class(config)
        encoder_embed = model.get_encoder().embed_tokens
        decoder_embed = model.get_decoder().embed_tokens
        self.assertIsNot(encoder_embed, decoder_embed)
        self.assertIsNot(encoder_embed.weight, decoder_embed.weight)

    # A model saved from a fresh (default) config must still come back with
    # separate embeddings when the flag is overridden at load time.
    config, _ = self.model_tester.prepare_config_and_inputs()
    for model_class in self.all_model_classes:
        saved_model = model_class(config)
        with tempfile.TemporaryDirectory() as save_dir:
            saved_model.save_pretrained(save_dir)
            reloaded = model_class.from_pretrained(save_dir, share_encoder_decoder_embeddings=False)
            encoder_embed = reloaded.get_encoder().embed_tokens
            decoder_embed = reloaded.get_decoder().embed_tokens
            self.assertIsNot(encoder_embed, decoder_embed)
            self.assertIsNot(encoder_embed.weight, decoder_embed.weight)
|
||||
|
||||
def test_resize_decoder_token_embeddings(self):
    # Checks `resize_decoder_token_embeddings` in three steps: it must refuse
    # to run with the default config, it must resize the decoder embedding
    # matrix once sharing is disabled, and it must resize `lm_head` as well.
    config, _ = self.model_tester.prepare_config_and_inputs()

    # Default config (embeddings shared): resizing only the decoder side is
    # expected to raise ValueError.
    for model_class in self.all_model_classes:
        shared_model = model_class(config)
        with self.assertRaises(ValueError):
            shared_model.resize_decoder_token_embeddings(config.vocab_size + 1)

    # Sharing disabled: the decoder embedding matrix gains exactly one row.
    config.share_encoder_decoder_embeddings = False
    for model_class in self.all_model_classes:
        unshared_model = model_class(config)
        unshared_model.resize_decoder_token_embeddings(config.vocab_size + 1)
        decoder_weight = unshared_model.get_decoder().embed_tokens.weight
        self.assertEqual(decoder_weight.shape, (config.vocab_size + 1, config.d_model))

    # The output projection must follow the decoder vocabulary size too;
    # start again from a fresh config so earlier mutations do not leak in.
    config, _ = self.model_tester.prepare_config_and_inputs()
    config.share_encoder_decoder_embeddings = False
    mt_model = MarianMTModel(config)
    mt_model.resize_decoder_token_embeddings(config.vocab_size + 1)
    lm_head_weight = mt_model.lm_head.weight
    self.assertEqual(lm_head_weight.shape, (config.vocab_size + 1, config.d_model))
|
||||
|
||||
def test_tie_word_embeddings_decoder(self):
    # Intentionally a no-op: the method body does nothing and always passes.
    # NOTE(review): presumably this shadows a same-named check inherited from
    # one of the test mixins that does not apply to this model - confirm
    # against the base classes before removing it.
    return None
|
||||
|
||||
|
||||
def assert_tensors_close(a, b, atol=1e-12, prefix=""):
|
||||
"""If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error."""
|
||||
@@ -529,6 +581,27 @@ class TestMarian_en_ROMANCE(MarianIntegrationTest):
|
||||
self.assertEqual(self.expected_text, [x["translation_text"] for x in output])
|
||||
|
||||
|
||||
@require_sentencepiece
@require_tokenizers
class TestMarian_FI_EN_V2(MarianIntegrationTest):
    """Integration test for a Finnish -> English Marian checkpoint.

    The expected translations in ``expected_text`` line up one-to-one with
    the inputs in ``src_text``.
    NOTE(review): the "v2" checkpoint presumably uses separate source and
    target vocabularies - confirm against the checkpoint's config.
    """

    src = "fi"
    tgt = "en"
    src_text = [
        "minä tykkään kirjojen lukemisesta",
        "Pidän jalkapallon katsomisesta",
    ]
    expected_text = ["I like to read books", "I like watching football"]

    @classmethod
    def setUpClass(cls) -> None:
        """Pin the checkpoint name used by this test class."""
        # unittest ignores the return value of setUpClass, and the signature
        # is annotated ``-> None``, so nothing is returned here.
        cls.model_name = "hf-internal-testing/test-opus-tatoeba-fi-en-v2"

    @slow
    def test_batch_generation_en_fr(self):
        """Translate ``src_text`` as a batch and compare with ``expected_text``."""
        # NOTE(review): the ``en_fr`` suffix does not match this class's
        # fi -> en language pair; the name is kept so existing test IDs stay
        # stable - consider renaming in a dedicated change.
        self._assert_generated_batch_equal_expected()
|
||||
|
||||
|
||||
@require_torch
|
||||
class TestConversionUtils(unittest.TestCase):
|
||||
def test_renaming_multilingual(self):
|
||||
|
||||
Reference in New Issue
Block a user