Run mlm pad to multiple for fp16 (#11128)

* Add mlm collator pad to multiple option (#10627)

* Use padding to 8x in run mlm (#10627)
This commit is contained in:
Andrea Cappelli
2021-04-08 22:12:49 +02:00
committed by GitHub
parent dfed4ec263
commit 6c40e49712
3 changed files with 67 additions and 9 deletions

View File

@@ -146,11 +146,8 @@ class DataCollatorIntegrationTest(unittest.TestCase):
self.assertEqual(batch["labels"].shape, torch.Size([2, 6]))
self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-1] * 3)
def test_data_collator_for_language_modeling(self):
def _test_no_pad_and_pad(self, no_pad_features, pad_features):
tokenizer = BertTokenizer(self.vocab_file)
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
@@ -160,6 +157,15 @@ class DataCollatorIntegrationTest(unittest.TestCase):
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
batch = data_collator(pad_features)
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
tokenizer._pad_token = None
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
with self.assertRaises(ValueError):
@@ -185,6 +191,32 @@ class DataCollatorIntegrationTest(unittest.TestCase):
self.assertTrue(torch.any(masked_tokens))
self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
batch = data_collator(no_pad_features)
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(torch.any(masked_tokens))
self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
batch = data_collator(pad_features)
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
self.assertTrue(torch.any(masked_tokens))
self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
def test_data_collator_for_language_modeling(self):
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
self._test_no_pad_and_pad(no_pad_features, pad_features)
no_pad_features = [list(range(10)), list(range(10))]
pad_features = [list(range(5)), list(range(10))]
self._test_no_pad_and_pad(no_pad_features, pad_features)
def test_plm(self):
tokenizer = BertTokenizer(self.vocab_file)
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
@@ -225,6 +257,14 @@ class DataCollatorIntegrationTest(unittest.TestCase):
self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,)))
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,)))
def test_sop(self):
tokenizer = BertTokenizer(self.vocab_file)
features = [
@@ -242,3 +282,11 @@ class DataCollatorIntegrationTest(unittest.TestCase):
self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5)))
self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
batch = data_collator(features)
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))