From f566c6e3b71b14b1933cb3eeaf3cb57de1dc75bc Mon Sep 17 00:00:00 2001 From: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Date: Thu, 23 Dec 2021 13:59:33 -0500 Subject: [PATCH] Fix failing GPU trainer tests (#14903) * Fix failing GPU trainer tests * Remove print statements --- tests/extended/test_trainer_ext.py | 2 ++ tests/test_trainer.py | 8 +------- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index 3bc7a5c0bd..3a65f16580 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -130,6 +130,7 @@ class TestTrainerExt(TestCasePlus): self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp simple") # test --sharded_ddp w/ --fp16 + @unittest.skip("Requires an update of the env running those tests") @require_torch_multi_gpu @require_fairscale def test_run_seq2seq_sharded_ddp_fp16(self): @@ -142,6 +143,7 @@ class TestTrainerExt(TestCasePlus): self.run_seq2seq_quick(distributed=True, extra_args_str="--sharded_ddp zero_dp_2", predict_with_generate=False) # test --sharded_ddp zero_dp_2 w/ --fp16 + @unittest.skip("Requires an update of the env running those tests") @require_torch_multi_gpu @require_fairscale def test_run_seq2seq_fully_sharded_ddp_fp16(self): diff --git a/tests/test_trainer.py b/tests/test_trainer.py index 34935aac27..1beec9017d 100644 --- a/tests/test_trainer.py +++ b/tests/test_trainer.py @@ -1093,17 +1093,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon): self.assertIsInstance(loader.sampler, torch.utils.data.dataloader._InfiniteConstantSampler) def test_training_finite_iterable_dataset(self): - num_gpus = max(1, get_gpu_count()) - if num_gpus > 2: - return - config = RegressionModelConfig() model = RegressionPreTrainedModel(config) batch_size = 1 num_samples = 10 - available_steps = num_samples // (batch_size * num_gpus) + available_steps = num_samples // batch_size data = FiniteIterableDataset(length=num_samples) train_args = TrainingArguments( @@ -1510,7 +1506,6 @@ class TrainerIntegrationWithHubTester(unittest.TestCase): expected_commits = [f"Training in progress, epoch {i}" for i in range(3, 0, -1)] expected_commits.append("initial commit") self.assertListEqual(commits, expected_commits) - print(commits, len(commits)) def test_push_to_hub_with_saves_each_n_steps(self): num_gpus = max(1, get_gpu_count()) @@ -1534,7 +1529,6 @@ class TrainerIntegrationWithHubTester(unittest.TestCase): expected_commits = [f"Training in progress, step {i}" for i in range(total_steps, 0, -5)] expected_commits.append("initial commit") self.assertListEqual(commits, expected_commits) - print(commits, len(commits)) @require_torch