Run mlm pad to multiple for fp16 (#11128)

* Add mlm collator pad to multiple option (#10627)
* Use padding to 8x in run mlm (#10627)
2021-04-08 22:12:49 +02:00
parent dfed4ec263
commit 6c40e49712
3 changed files with 67 additions and 9 deletions
--- a/tests/test_data_collator.py
+++ b/tests/test_data_collator.py
@@ -146,11 +146,8 @@ class DataCollatorIntegrationTest(unittest.TestCase):
        self.assertEqual(batch["labels"].shape, torch.Size([2, 6]))
        self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-1] * 3)

-    def test_data_collator_for_language_modeling(self):
+    def _test_no_pad_and_pad(self, no_pad_features, pad_features):
        tokenizer = BertTokenizer(self.vocab_file)
-        no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
-        pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
-
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        batch = data_collator(no_pad_features)
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
@@ -160,6 +157,15 @@ class DataCollatorIntegrationTest(unittest.TestCase):
        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))

+        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)
+        batch = data_collator(no_pad_features)
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
+
+        batch = data_collator(pad_features)
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
+
        tokenizer._pad_token = None
        data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
        with self.assertRaises(ValueError):
@@ -185,6 +191,32 @@ class DataCollatorIntegrationTest(unittest.TestCase):
        self.assertTrue(torch.any(masked_tokens))
        self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))

+        data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
+        batch = data_collator(no_pad_features)
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
+
+        masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
+        self.assertTrue(torch.any(masked_tokens))
+        self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
+
+        batch = data_collator(pad_features)
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
+
+        masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
+        self.assertTrue(torch.any(masked_tokens))
+        self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
+
+    def test_data_collator_for_language_modeling(self):
+        no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
+        pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
+        self._test_no_pad_and_pad(no_pad_features, pad_features)
+
+        no_pad_features = [list(range(10)), list(range(10))]
+        pad_features = [list(range(5)), list(range(10))]
+        self._test_no_pad_and_pad(no_pad_features, pad_features)
+
    def test_plm(self):
        tokenizer = BertTokenizer(self.vocab_file)
        no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
@@ -225,6 +257,14 @@ class DataCollatorIntegrationTest(unittest.TestCase):
        self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
        self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,)))

+        data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
+        batch = data_collator(features)
+
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,)))
+
    def test_sop(self):
        tokenizer = BertTokenizer(self.vocab_file)
        features = [
@@ -242,3 +282,11 @@ class DataCollatorIntegrationTest(unittest.TestCase):
        self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5)))
        self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
        self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))
+
+        data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
+        batch = data_collator(features)
+
+        self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
+        self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))