From 619200cc42d1f55fb07b0d6e9c3fff79e30a2e06 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 6 May 2021 13:35:28 -0700 Subject: [PATCH] [cuda ext tests] fixing tests (#11619) * fixing tests * cleanup --- .github/workflows/self-scheduled.yml | 2 ++ tests/deepspeed/test_deepspeed.py | 7 ++++--- tests/extended/test_trainer_ext.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/.github/workflows/self-scheduled.yml b/.github/workflows/self-scheduled.yml index 3f15c3f4bb..bd034d9ee8 100644 --- a/.github/workflows/self-scheduled.yml +++ b/.github/workflows/self-scheduled.yml @@ -261,6 +261,7 @@ jobs: - name: Install dependencies run: | + apt -y update && apt install -y libaio-dev pip install --upgrade pip pip install .[testing,deepspeed] @@ -301,6 +302,7 @@ jobs: - name: Install dependencies run: | + apt -y update && apt install -y libaio-dev pip install --upgrade pip pip install .[testing,deepspeed,fairscale] diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index 0c829e5932..f345157b2f 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -318,9 +318,10 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): yes_grad_accum_b = yes_grad_accum_trainer.model.b.item() self.assertNotEqual(yes_grad_accum_a, a) - # training with half the batch size but accumulation steps as 2 should give the same weights - self.assertEqual(no_grad_accum_a, yes_grad_accum_a) - self.assertEqual(no_grad_accum_b, yes_grad_accum_b) + # training with half the batch size but accumulation steps as 2 should give the same + # weights, but sometimes get a slight difference still of 1e-6 + self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5) + self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5) # see the note above how to get identical loss on a small bs self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=5) diff --git a/tests/extended/test_trainer_ext.py b/tests/extended/test_trainer_ext.py index bae3587400..4cf16549c7 100644 --- a/tests/extended/test_trainer_ext.py +++ b/tests/extended/test_trainer_ext.py @@ -167,8 +167,8 @@ class TestTrainerExt(TestCasePlus): # test if do_predict saves generations and metrics contents = os.listdir(output_dir) contents = {os.path.basename(p) for p in contents} - assert "test_generations.txt" in contents - assert "test_results.json" in contents + assert "generated_predictions.txt" in contents + assert "predict_results.json" in contents def run_trainer( self,