Support multiple validation datasets when dataloader_persistent_workers=True (#30627)

* Support multiple validation datasets when dataloader_persistent_workers=True

* Test support of multiple validation datasets
This commit is contained in:
Bastien Le Chenadec
2024-06-17 17:58:39 +02:00
committed by GitHub
parent 147c404fb1
commit 485fd81471
2 changed files with 116 additions and 10 deletions

View File

@@ -1231,6 +1231,97 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
trainer.train()
trainer.evaluate()
def test_get_eval_dataloader_without_persistent_workers(self):
    """Check `get_eval_dataloader` when `dataloader_persistent_workers=False`.

    For a single evaluation dataset and for a dict of named evaluation datasets, each
    returned dataloader must wrap the expected dataset, and two successive requests for
    the same dataset must not compare equal.
    """

    def passthrough(x):
        return x

    train_dataset = RegressionDataset()
    model = GPT2LMHeadModel(GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4))
    training_args = TrainingArguments("./test", report_to="none", dataloader_persistent_workers=False)

    # --- Single evaluation dataset ---
    single_dataset = RegressionDataset()
    trainer = Trainer(model, training_args, train_dataset=train_dataset, eval_dataset=single_dataset)
    # Stub out accelerator.prepare with an identity function so it cannot change the
    # dataloader between calls to get_eval_dataloader.
    trainer.accelerator.prepare = passthrough

    implicit_loader = trainer.get_eval_dataloader()
    explicit_loader = trainer.get_eval_dataloader(single_dataset)

    self.assertEqual(implicit_loader.dataset, single_dataset)
    self.assertEqual(explicit_loader.dataset, single_dataset)
    self.assertNotEqual(implicit_loader, explicit_loader)

    # --- Multiple (named) evaluation datasets ---
    named_datasets = {"first": RegressionDataset(), "second": RegressionDataset()}
    trainer = Trainer(
        model,
        training_args,
        train_dataset=train_dataset,
        eval_dataset=named_datasets,
    )
    # Same identity stub as above, applied to the new trainer.
    trainer.accelerator.prepare = passthrough

    # Request every named dataloader twice, back to back, in dataset order.
    fetched = {}
    for name in named_datasets:
        initial = trainer.get_eval_dataloader(name)
        repeated = trainer.get_eval_dataloader(name)
        fetched[name] = (initial, repeated)

    for name, dataset in named_datasets.items():
        initial, repeated = fetched[name]
        self.assertEqual(dataset, initial.dataset)
        self.assertEqual(initial.dataset, repeated.dataset)

    for initial, repeated in fetched.values():
        self.assertNotEqual(initial, repeated)
def test_get_eval_dataloader_with_persistent_workers(self):
    """Check `get_eval_dataloader` when `dataloader_persistent_workers=True`.

    For a single evaluation dataset and for a dict of named evaluation datasets, each
    returned dataloader must wrap the expected dataset, and two successive requests for
    the same dataset must compare equal.
    """

    def passthrough(x):
        return x

    train_dataset = RegressionDataset()
    model = GPT2LMHeadModel(GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4))
    training_args = TrainingArguments(
        "./test",
        report_to="none",
        dataloader_persistent_workers=True,
        dataloader_num_workers=2,
    )

    # --- Single evaluation dataset ---
    single_dataset = RegressionDataset()
    trainer = Trainer(model, training_args, train_dataset=train_dataset, eval_dataset=single_dataset)
    # Stub out accelerator.prepare with an identity function so it cannot change the
    # dataloader between calls to get_eval_dataloader.
    trainer.accelerator.prepare = passthrough

    implicit_loader = trainer.get_eval_dataloader()
    explicit_loader = trainer.get_eval_dataloader(single_dataset)

    self.assertEqual(implicit_loader.dataset, single_dataset)
    self.assertEqual(explicit_loader.dataset, single_dataset)
    self.assertEqual(implicit_loader, explicit_loader)

    # --- Multiple (named) evaluation datasets ---
    named_datasets = {"first": RegressionDataset(), "second": RegressionDataset()}
    trainer = Trainer(
        model,
        training_args,
        train_dataset=train_dataset,
        eval_dataset=named_datasets,
    )
    # Same identity stub as above, applied to the new trainer.
    trainer.accelerator.prepare = passthrough

    # Request every named dataloader twice, back to back, in dataset order.
    fetched = {}
    for name in named_datasets:
        initial = trainer.get_eval_dataloader(name)
        repeated = trainer.get_eval_dataloader(name)
        fetched[name] = (initial, repeated)

    for name, dataset in named_datasets.items():
        initial, repeated = fetched[name]
        self.assertEqual(dataset, initial.dataset)
        self.assertEqual(initial.dataset, repeated.dataset)

    for initial, repeated in fetched.values():
        self.assertEqual(initial, repeated)
@require_lomo
@require_torch_gpu
def test_lomo(self):