Run mlm pad to multiple for fp16 (#11128)
* Add mlm collator pad to multiple option (#10627)
* Use padding to 8x in run mlm (#10627)
This commit is contained in:
@@ -146,11 +146,8 @@ class DataCollatorIntegrationTest(unittest.TestCase):
|
||||
self.assertEqual(batch["labels"].shape, torch.Size([2, 6]))
|
||||
self.assertEqual(batch["labels"][0].tolist(), [0, 1, 2] + [-1] * 3)
|
||||
|
||||
def test_data_collator_for_language_modeling(self):
|
||||
def _test_no_pad_and_pad(self, no_pad_features, pad_features):
|
||||
tokenizer = BertTokenizer(self.vocab_file)
|
||||
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
|
||||
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
|
||||
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
|
||||
batch = data_collator(no_pad_features)
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
|
||||
@@ -160,6 +157,15 @@ class DataCollatorIntegrationTest(unittest.TestCase):
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 10)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 10)))
|
||||
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False, pad_to_multiple_of=8)
|
||||
batch = data_collator(no_pad_features)
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
|
||||
|
||||
batch = data_collator(pad_features)
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
|
||||
|
||||
tokenizer._pad_token = None
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)
|
||||
with self.assertRaises(ValueError):
|
||||
@@ -185,6 +191,32 @@ class DataCollatorIntegrationTest(unittest.TestCase):
|
||||
self.assertTrue(torch.any(masked_tokens))
|
||||
self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
|
||||
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
|
||||
batch = data_collator(no_pad_features)
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
|
||||
|
||||
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
|
||||
self.assertTrue(torch.any(masked_tokens))
|
||||
self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
|
||||
|
||||
batch = data_collator(pad_features)
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 16)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 16)))
|
||||
|
||||
masked_tokens = batch["input_ids"] == tokenizer.mask_token_id
|
||||
self.assertTrue(torch.any(masked_tokens))
|
||||
self.assertTrue(all(x == -100 for x in batch["labels"][~masked_tokens].tolist()))
|
||||
|
||||
def test_data_collator_for_language_modeling(self):
|
||||
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
|
||||
pad_features = [{"input_ids": list(range(5))}, {"input_ids": list(range(10))}]
|
||||
self._test_no_pad_and_pad(no_pad_features, pad_features)
|
||||
|
||||
no_pad_features = [list(range(10)), list(range(10))]
|
||||
pad_features = [list(range(5)), list(range(10))]
|
||||
self._test_no_pad_and_pad(no_pad_features, pad_features)
|
||||
|
||||
def test_plm(self):
|
||||
tokenizer = BertTokenizer(self.vocab_file)
|
||||
no_pad_features = [{"input_ids": list(range(10))}, {"input_ids": list(range(10))}]
|
||||
@@ -225,6 +257,14 @@ class DataCollatorIntegrationTest(unittest.TestCase):
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
|
||||
self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,)))
|
||||
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
|
||||
batch = data_collator(features)
|
||||
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
|
||||
self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
|
||||
self.assertEqual(batch["next_sentence_label"].shape, torch.Size((2,)))
|
||||
|
||||
def test_sop(self):
|
||||
tokenizer = BertTokenizer(self.vocab_file)
|
||||
features = [
|
||||
@@ -242,3 +282,11 @@ class DataCollatorIntegrationTest(unittest.TestCase):
|
||||
self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 5)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 5)))
|
||||
self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))
|
||||
|
||||
data_collator = DataCollatorForLanguageModeling(tokenizer, pad_to_multiple_of=8)
|
||||
batch = data_collator(features)
|
||||
|
||||
self.assertEqual(batch["input_ids"].shape, torch.Size((2, 8)))
|
||||
self.assertEqual(batch["token_type_ids"].shape, torch.Size((2, 8)))
|
||||
self.assertEqual(batch["labels"].shape, torch.Size((2, 8)))
|
||||
self.assertEqual(batch["sentence_order_label"].shape, torch.Size((2,)))
|
||||
|
||||
Reference in New Issue
Block a user