[M2M100] fix positional embeddings (#10590)

* fix tests * emb should be a parameter * fix positional embeddings * fix make_weights * don't save pos embeds * add comment to describe the clamping
2021-03-08 16:06:19 +05:30
parent d59464db6b
commit 2a737bffef
2 changed files with 29 additions and 9 deletions
--- a/tests/test_modeling_m2m_100.py
+++ b/tests/test_modeling_m2m_100.py
@@ -96,13 +96,19 @@ class M2M100ModelTester:

    def prepare_config_and_inputs(self):
        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
-        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size).clamp(
-            3,
-        )
        input_ids[:, -1] = self.eos_token_id  # Eos Token
-
        decoder_input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)

+        # we need to clamp the input ids here to avoid having pad token in between
+        # this is because for M2M100 the position_ids are prepared such that
+        # all pad tokens have pos id = 2 and rest are between 2..seq_length
+        # and the seq_length here is seq_length - num_pad_tokens
+        # but when using past, there is no way of knowing if the past input ids had
+        # pad tokens in them, which results in an incorrect seq_length, which in turn results in
+        # position_ids being off by num_pad_tokens in past input
+        input_ids = input_ids.clamp(self.pad_token_id + 1)
+        decoder_input_ids = decoder_input_ids.clamp(self.pad_token_id + 1)
+
        config = M2M100Config(
            vocab_size=self.vocab_size,
            d_model=self.hidden_size,