support new marian models (#15831)

* support not sharing embeddings

* update modeling

* update tokenizer

* fix conversion script

* always use self.shared

* boom boom

* begin tests

* update tests

* fix resize_decoder_token_embeddings

* address Patrick's comments

* style

* update conversion script

* fix conversion script

* fix tokenizer

* better name target vocab

* add integration test for tokenizer with two vocabs

* style

* address Patrick's comments

* add integration test for model
This commit is contained in:
Suraj Patil
2022-03-10 19:41:56 +01:00
committed by GitHub
parent e66743e6c9
commit ba21001f4c
6 changed files with 386 additions and 63 deletions

View File

@@ -268,6 +268,58 @@ class MarianModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
model.generate(input_ids, attention_mask=attention_mask)
model.generate(num_beams=4, do_sample=True, early_stopping=False, num_return_sequences=3)
def test_share_encoder_decoder_embeddings(self):
config, input_dict = self.model_tester.prepare_config_and_inputs()
# check if embeddings are shared by default
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIs(model.get_encoder().embed_tokens, model.get_decoder().embed_tokens)
self.assertIs(model.get_encoder().embed_tokens.weight, model.get_decoder().embed_tokens.weight)
# check if embeddings are not shared when config.share_encoder_decoder_embeddings = False
config.share_encoder_decoder_embeddings = False
for model_class in self.all_model_classes:
model = model_class(config)
self.assertIsNot(model.get_encoder().embed_tokens, model.get_decoder().embed_tokens)
self.assertIsNot(model.get_encoder().embed_tokens.weight, model.get_decoder().embed_tokens.weight)
# check if a model with shared embeddings can be saved and loaded with share_encoder_decoder_embeddings = False
config, _ = self.model_tester.prepare_config_and_inputs()
for model_class in self.all_model_classes:
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname, share_encoder_decoder_embeddings=False)
self.assertIsNot(model.get_encoder().embed_tokens, model.get_decoder().embed_tokens)
self.assertIsNot(model.get_encoder().embed_tokens.weight, model.get_decoder().embed_tokens.weight)
def test_resize_decoder_token_embeddings(self):
config, _ = self.model_tester.prepare_config_and_inputs()
# check if resize_decoder_token_embeddings raises an error when embeddings are shared
for model_class in self.all_model_classes:
model = model_class(config)
with self.assertRaises(ValueError):
model.resize_decoder_token_embeddings(config.vocab_size + 1)
# check if decoder embeddings are resized when config.share_encoder_decoder_embeddings = False
config.share_encoder_decoder_embeddings = False
for model_class in self.all_model_classes:
model = model_class(config)
model.resize_decoder_token_embeddings(config.vocab_size + 1)
self.assertEqual(model.get_decoder().embed_tokens.weight.shape, (config.vocab_size + 1, config.d_model))
# check if lm_head is also resized
config, _ = self.model_tester.prepare_config_and_inputs()
config.share_encoder_decoder_embeddings = False
model = MarianMTModel(config)
model.resize_decoder_token_embeddings(config.vocab_size + 1)
self.assertEqual(model.lm_head.weight.shape, (config.vocab_size + 1, config.d_model))
def test_tie_word_embeddings_decoder(self):
pass
def assert_tensors_close(a, b, atol=1e-12, prefix=""):
"""If tensors have different shapes, different values or a and b are not both tensors, raise a nice Assertion error."""
@@ -529,6 +581,27 @@ class TestMarian_en_ROMANCE(MarianIntegrationTest):
self.assertEqual(self.expected_text, [x["translation_text"] for x in output])
@require_sentencepiece
@require_tokenizers
class TestMarian_FI_EN_V2(MarianIntegrationTest):
src = "fi"
tgt = "en"
src_text = [
"minä tykkään kirjojen lukemisesta",
"Pidän jalkapallon katsomisesta",
]
expected_text = ["I like to read books", "I like watching football"]
@classmethod
def setUpClass(cls) -> None:
cls.model_name = "hf-internal-testing/test-opus-tatoeba-fi-en-v2"
return cls
@slow
def test_batch_generation_en_fr(self):
self._assert_generated_batch_equal_expected()
@require_torch
class TestConversionUtils(unittest.TestCase):
def test_renaming_multilingual(self):

View File

@@ -134,3 +134,22 @@ class MarianTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
revision="1a8c2263da11e68e50938f97e10cd57820bd504c",
decode_kwargs={"use_source_tokenizer": True},
)
def test_tokenizer_integration_seperate_vocabs(self):
tokenizer = MarianTokenizer.from_pretrained("hf-internal-testing/test-marian-two-vocabs")
source_text = "Tämä on testi"
target_text = "This is a test"
expected_src_ids = [76, 7, 2047, 2]
expected_target_ids = [69, 12, 11, 940, 2]
src_ids = tokenizer(source_text).input_ids
self.assertListEqual(src_ids, expected_src_ids)
with tokenizer.as_target_tokenizer():
target_ids = tokenizer(target_text).input_ids
self.assertListEqual(target_ids, expected_target_ids)
decoded = tokenizer.decode(target_ids, skip_special_tokens=True)
self.assertEqual(decoded, target_text)