Add deepspeed test to amd scheduled CI (#27633)

* add deepspeed scheduled test for amd * fix image * add dockerfile * add comment * enable tests * trigger * remove trigger for this branch * trigger * change runner env to trigger the docker build image test * use new docker image * remove test suffix from docker image tag * replace test docker image with original image * push new image * Trigger * add back amd tests * fix typo * add amd tests back * fix * comment until docker image build scheduled test fix * remove deprecated deepspeed build option * upgrade torch * update docker & make tests pass * Update docker/transformers-pytorch-deepspeed-amd-gpu/Dockerfile * fix * tmp disable test * precompile deepspeed to avoid timeout during tests * fix comment * trigger deepspeed tests with new image * comment tests * trigger * add sklearn dependency to fix slow tests * enable back other tests * final update --------- Co-authored-by: Felix Marty <felix@hf.co> Co-authored-by: Félix Marty <9808326+fxmarty@users.noreply.github.com> Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2023-12-11 16:33:36 +01:00
parent 0f59d2f173
commit 39acfe84ba
10 changed files with 155 additions and 13 deletions
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -561,8 +561,8 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
        self.assertAlmostEqual(no_grad_accum_a, yes_grad_accum_a, places=5)
        self.assertAlmostEqual(no_grad_accum_b, yes_grad_accum_b, places=5)

-        # see the note above how to get identical loss on a small bs
-        self.assertAlmostEqual(no_grad_accum_loss, yes_grad_accum_loss, places=2)
+        # Relative difference. See the note above how to get identical loss on a small bs
+        self.assertTrue((no_grad_accum_loss - yes_grad_accum_loss) / (no_grad_accum_loss + 1e-15) <= 1e-3)

    def check_saved_checkpoints_deepspeed(self, output_dir, freq, total, stage, dtype):
        # adapted from TrainerIntegrationCommon.check_saved_checkpoints