Use inherited tempdir makers for tests + fix failing DS tests (#35600)

* Use existing APIs to make tempdir folders
* Fixup deepspeed too
* output_dir -> tmp_dir
2025-01-10 10:01:58 -05:00
parent bbc00046b9
commit 1211e616a4
2 changed files with 636 additions and 620 deletions
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -482,6 +482,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
                max_grad_norm=max_grad_norm,
                adam_beta1=adam_beta1,
                adam_beta2=adam_beta2,
                output_dir=self.get_auto_remove_tmp_dir(),
            )
            with self.assertRaises(Exception) as context:
                trainer.train()
@@ -506,7 +507,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
            del ds_config_zero2_dict["scheduler"]  # force default HF Trainer scheduler
            ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
            ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
-            trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
+            trainer = get_regression_trainer(
                a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
            )
            trainer.train()
        new_a = trainer.model.a.item()
        self.assertNotEqual(new_a, a)
@@ -518,7 +521,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
            del ds_config_zero2_dict["optimizer"]  # force default HF Trainer optimizer
            ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
            ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
-            trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
+            trainer = get_regression_trainer(
                a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
            )
            trainer.train()
        new_a = trainer.model.a.item()
        self.assertNotEqual(new_a, a)
@@ -530,7 +535,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
            del ds_config_zero2_dict["scheduler"]  # force default HF Trainer scheduler
            ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
            ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
-            trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
+            trainer = get_regression_trainer(
                a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict, output_dir=self.get_auto_remove_tmp_dir()
            )
            trainer.train()
        new_a = trainer.model.a.item()
        self.assertNotEqual(new_a, a)
@@ -546,7 +553,9 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
            ds_config_zero3_dict["zero_optimization"]["offload_optimizer"] = nvme_config
            ds_config_zero3_dict["zero_optimization"]["offload_param"] = nvme_config
            ds_config_zero3_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True
-            trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict)
+            trainer = get_regression_trainer(
                local_rank=0, fp16=True, deepspeed=ds_config_zero3_dict, output_dir=self.get_auto_remove_tmp_dir()
            )
            with CaptureLogger(deepspeed_logger) as cl:
                trainer.train()
            self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")
@@ -567,6 +576,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
                fp16=True,
                model_init=model_init,
                deepspeed=ds_config_zero3_dict,
                output_dir=self.get_auto_remove_tmp_dir(),
            )
            n_trials = 3
@@ -588,7 +598,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
        ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
        ds_config_dict["zero_force_ds_cpu_optimizer"] = False  # offload is not efficient w/o CPUAdam
        with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = {"local_rank": 0, "deepspeed": ds_config_dict}
+            kwargs = {"local_rank": 0, "deepspeed": ds_config_dict, "output_dir": self.get_auto_remove_tmp_dir()}
            kwargs[dtype] = True
            trainer = get_regression_trainer(**kwargs)
            with CaptureLogger(deepspeed_logger) as cl:
@@ -604,7 +614,11 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
        # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
        # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
        with mockenv_context(**self.dist_env_1_gpu):
-            kwargs = {"local_rank": 0, "deepspeed": self.get_config_dict(stage)}
+            kwargs = {
                "local_rank": 0,
                "deepspeed": self.get_config_dict(stage),
                "output_dir": self.get_auto_remove_tmp_dir(),
            }
            kwargs[dtype] = True
            trainer = get_regression_trainer(**kwargs)
@@ -630,6 +644,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
                "deepspeed": self.get_config_dict(stage),
                "per_device_train_batch_size": 8,
                "logging_steps": 1,
                "output_dir": self.get_auto_remove_tmp_dir(),
            }
            kwargs[dtype] = True
            trainer = get_regression_trainer(**kwargs)
@@ -673,6 +688,7 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
            "local_rank": 0,
            "train_len": train_len,
            "deepspeed": self.get_config_dict(stage),
            "output_dir": self.get_auto_remove_tmp_dir(),
        }
        kwargs[dtype] = True
--- a/tests/trainer/test_trainer.py
+++ b/tests/trainer/test_trainer.py
@@ -1222,8 +1222,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        train_dataset = RegressionDataset()
        eval_dataset = RegressionDataset()
        model = RegressionDictModel()
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none")
            args = TrainingArguments(tmp_dir, report_to="none")
        trainer = Trainer(model, args, train_dataset=train_dataset, eval_dataset=eval_dataset)
        trainer.train()
        _ = trainer.evaluate()
@@ -1234,8 +1233,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        eval_dataset = RepeatDataset(x)
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none")
            args = TrainingArguments(tmp_dir, report_to="none")
        trainer = Trainer(tiny_gpt2, args, eval_dataset=eval_dataset)
        # By default the past_key_values are removed
        result = trainer.predict(eval_dataset)
@@ -1246,7 +1244,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        self.assertEqual(len(result.predictions), 2)
    def test_training_arguments_are_left_untouched(self):
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(output_dir=tmp_dir)
        trainer.train()
        args = TrainingArguments(tmp_dir, report_to=[])
@@ -1258,7 +1256,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
    def test_number_of_steps_in_training(self):
        # Regular training has n_epochs * len(train_dl) steps
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(learning_rate=0.1, output_dir=tmp_dir)
        train_output = trainer.train()
        self.assertEqual(train_output.global_step, self.n_epochs * 64 / self.batch_size)
@@ -1277,7 +1275,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
    @require_intel_extension_for_pytorch
    def test_number_of_steps_in_training_with_ipex(self):
        for mix_bf16 in [True, False]:
-            with tempfile.TemporaryDirectory() as tmp_dir:
+            tmp_dir = self.get_auto_remove_tmp_dir()
            # Regular training has n_epochs * len(train_dl) steps
            trainer = get_regression_trainer(
                learning_rate=0.1, use_ipex=True, bf16=mix_bf16, use_cpu=True, output_dir=tmp_dir
@@ -1311,9 +1309,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmp_dir:
        args = TrainingArguments(
-                tmp_dir,
+            self.get_auto_remove_tmp_dir(),
            per_device_train_batch_size=2,
            torch_compile=True,
            max_steps=1,  # compile happens on the first step
@@ -1348,9 +1345,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmp_dir:
        args = TrainingArguments(
-                tmp_dir,
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
        )
@@ -1387,9 +1383,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        tokenizer.pad_token = tokenizer.eos_token
-        with tempfile.TemporaryDirectory() as tmpdir:
+        tmp_dir = self.get_auto_remove_tmp_dir()
        args = TrainingArguments(
-                tmpdir,
+            tmp_dir,
            per_device_train_batch_size=1,
            learning_rate=1e-9,
            save_steps=5,
@@ -1406,7 +1402,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        # Reinitialize trainer
        trainer = Trainer(tiny_model, args, processing_class=tokenizer, train_dataset=train_dataset)
-            checkpoint = os.path.join(tmpdir, "checkpoint-5")
+        checkpoint = os.path.join(tmp_dir, "checkpoint-5")
        trainer.train(resume_from_checkpoint=checkpoint)
        parameters1 = dict(tiny_model.named_parameters())
@@ -1421,10 +1417,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb"
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            logging_nan_inf_filter=False,
            optim="rmsprop_bnb",
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
@@ -1438,10 +1437,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="ademamix"
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            logging_nan_inf_filter=False,
            optim="ademamix",
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
@@ -1455,10 +1457,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="ademamix_8bit"
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            logging_nan_inf_filter=False,
            optim="ademamix_8bit",
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
@@ -1472,10 +1477,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb_8bit"
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            logging_nan_inf_filter=False,
            optim="rmsprop_bnb_8bit",
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
@@ -1488,10 +1496,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        tiny_gpt2 = GPT2LMHeadModel(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir, learning_rate=1e-9, logging_steps=5, logging_nan_inf_filter=False, optim="rmsprop_bnb_32bit"
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            logging_nan_inf_filter=False,
            optim="rmsprop_bnb_32bit",
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
@@ -1505,9 +1516,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        train_dataset = RepeatDataset(x)
        # Trainer without inf/nan filter
        with tempfile.TemporaryDirectory() as tmp_dir:
        args = TrainingArguments(
-                tmp_dir,
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            logging_nan_inf_filter=False,
@@ -1528,9 +1538,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        # redefine the model
        tiny_gpt2 = GPT2LMHeadModel(config)
        # Trainer without inf/nan filter
        with tempfile.TemporaryDirectory() as tmp_dir:
        args = TrainingArguments(
-                tmp_dir,
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            logging_nan_inf_filter=False,
@@ -1561,18 +1570,24 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        train_dataset = RepeatDataset(x)
        # Trainer without inf/nan filter
        with tempfile.TemporaryDirectory() as tmp_dir:
        args = TrainingArguments(
-                tmp_dir, learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=False, report_to="none"
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e9,
            logging_steps=5,
            logging_nan_inf_filter=False,
            report_to="none",
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
        trainer.train()
        log_history_no_filter = trainer.state.log_history
        # Trainer with inf/nan filter
        with tempfile.TemporaryDirectory() as tmp_dir:
        args = TrainingArguments(
-                tmp_dir, learning_rate=1e9, logging_steps=5, logging_nan_inf_filter=True, report_to="none"
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e9,
            logging_steps=5,
            logging_nan_inf_filter=True,
            report_to="none",
        )
        trainer = Trainer(tiny_gpt2, args, train_dataset=train_dataset)
        trainer.train()
@@ -1591,7 +1606,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        else:
            n_gpu = 1
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        tmp_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(learning_rate=0.1, per_device_train_batch_size=16, output_dir=tmp_dir)
        self.assertEqual(trainer.get_train_dataloader().total_batch_size, 16 * n_gpu)
        trainer = get_regression_trainer(learning_rate=0.1, per_device_eval_batch_size=16, output_dir=tmp_dir)
@@ -1628,12 +1643,11 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
    # tests that we do not require dataloader to have a .dataset attribute
    def test_dataloader_without_dataset(self):
        train_dataset = RegressionDataset(length=128)
        with tempfile.TemporaryDirectory() as tmp_dir:
        trainer = CustomDataloaderTrainer(
            model=RegressionModel(),
            train_dataset=train_dataset,
            eval_dataset=train_dataset,
-                args=TrainingArguments(output_dir=tmp_dir, report_to="none"),
+            args=TrainingArguments(output_dir=self.get_auto_remove_tmp_dir(), report_to="none"),
        )
        trainer.train()
@@ -1643,8 +1657,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        train_dataset = RegressionDataset()
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        args = TrainingArguments(self.get_auto_remove_tmp_dir(), report_to="none", dataloader_persistent_workers=False)
            args = TrainingArguments(tmp_dir, report_to="none", dataloader_persistent_workers=False)
        # Single evaluation dataset
        eval_dataset = RegressionDataset()
@@ -1687,9 +1700,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        train_dataset = RegressionDataset()
        config = GPT2Config(vocab_size=100, n_positions=128, n_embd=32, n_layer=3, n_head=4)
        tiny_gpt2 = GPT2LMHeadModel(config)
        with tempfile.TemporaryDirectory() as tmp_dir:
        args = TrainingArguments(
-                tmp_dir,
+            self.get_auto_remove_tmp_dir(),
            report_to="none",
            dataloader_persistent_workers=True,
            dataloader_num_workers=2,
@@ -1747,9 +1759,8 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
            self.assertNotEqual(modeling_llama.apply_rotary_pos_emb, liger_rotary_pos_emb)
            self.assertFalse(isinstance(tiny_llama.model.norm, LigerRMSNorm))
            with tempfile.TemporaryDirectory() as tmp_dir:
            args = TrainingArguments(
-                    tmp_dir,
+                self.get_auto_remove_tmp_dir(),
                use_liger_kernel=True,
            )
            Trainer(tiny_llama, args)
@@ -1768,8 +1779,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
-        with tempfile.TemporaryDirectory() as tmpdir:
+        args = TrainingArguments(
-            args = TrainingArguments(tmpdir, learning_rate=1e-2, logging_steps=5, max_steps=20, use_liger_kernel=True)
+            self.get_auto_remove_tmp_dir(), learning_rate=1e-2, logging_steps=5, max_steps=20, use_liger_kernel=True
        )
        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
        # Check this works
@@ -1786,9 +1798,10 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
-            args = TrainingArguments(tmpdir, learning_rate=1e-2, logging_steps=5, optim="lomo", max_steps=20)
+        args = TrainingArguments(
            self.get_auto_remove_tmp_dir(), learning_rate=1e-2, logging_steps=5, optim="lomo", max_steps=20
        )
        trainer = Trainer(tiny_llama, args, train_dataset=train_dataset)
        # Check this works
@@ -1805,10 +1818,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir,
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            optim="adalomo",
@@ -1820,16 +1832,15 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
    @require_grokadamw
    @require_torch_gpu
-    def test_grokadamw():
+    def test_grokadamw(self):
        config = LlamaConfig(vocab_size=100, hidden_size=32, num_hidden_layers=3, num_attention_heads=4)
        tiny_llama = LlamaForCausalLM(config)
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir,
+            self.get_auto_remove_tmp_dir(),
            learning_rate=2e-5,
            logging_steps=5,
            optim="grokadamw",
@@ -1848,10 +1859,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir,
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            optim="schedule_free_adamw",
@@ -1950,10 +1960,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir,
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            optim="galore_adamw",
@@ -1972,10 +1981,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir,
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            optim="galore_adamw",
@@ -1995,10 +2003,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir,
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            optim="galore_adamw_layerwise",
@@ -2017,10 +2024,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir,
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            optim="galore_adamw_layerwise",
@@ -2040,10 +2046,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir,
+            self.get_auto_remove_tmp_dir(),
            learning_rate=1e-9,
            logging_steps=5,
            optim="galore_adamw_8bit",
@@ -2156,13 +2161,12 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        learning_rate = 1e-9
        num_steps = 10
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir,
+            self.get_auto_remove_tmp_dir(),
            learning_rate=learning_rate,
            logging_steps=5,
            optim="galore_adamw",
@@ -2182,14 +2186,13 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        x = torch.randint(0, 100, (128,))
        train_dataset = RepeatDataset(x)
        with tempfile.TemporaryDirectory() as tmpdir:
        learning_rate = 2e-4
        num_train_epochs = 2
        num_warmup_steps = 5
        # Trainer without inf/nan filter
        args = TrainingArguments(
-                tmpdir,
+            self.get_auto_remove_tmp_dir(),
            num_train_epochs=num_train_epochs,
            learning_rate=learning_rate,
            warmup_steps=num_warmup_steps,
@@ -2707,41 +2710,41 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
                self.assertNotIn(log_info_string, cl.out)
    def test_save_checkpoints(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
+        tmp_dir = self.get_auto_remove_tmp_dir()
-            trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5)
+        trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=5)
        trainer.train()
-            self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size))
+        self.check_saved_checkpoints(tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size))
        # With a regular model that is not a PreTrainedModel
-        with tempfile.TemporaryDirectory() as tmpdir:
+        tmp_dir = self.get_auto_remove_tmp_dir()
-            trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5, pretrained=False)
+        trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=5, pretrained=False)
        trainer.train()
-            self.check_saved_checkpoints(tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False)
+        self.check_saved_checkpoints(tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size), False)
    @require_safetensors
    def test_safe_checkpoints(self):
        for save_safetensors in [True, False]:
-            with tempfile.TemporaryDirectory() as tmpdir:
+            tmp_dir = self.get_auto_remove_tmp_dir()
-                trainer = get_regression_trainer(output_dir=tmpdir, save_steps=5, save_safetensors=save_safetensors)
+            trainer = get_regression_trainer(output_dir=tmp_dir, save_steps=5, save_safetensors=save_safetensors)
            trainer.train()
            self.check_saved_checkpoints(
-                    tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), safe_weights=save_safetensors
+                tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size), safe_weights=save_safetensors
            )
            # With a regular model that is not a PreTrainedModel
-            with tempfile.TemporaryDirectory() as tmpdir:
+            tmp_dir = self.get_auto_remove_tmp_dir()
            trainer = get_regression_trainer(
-                    output_dir=tmpdir, save_steps=5, pretrained=False, save_safetensors=save_safetensors
+                output_dir=tmp_dir, save_steps=5, pretrained=False, save_safetensors=save_safetensors
            )
            trainer.train()
            self.check_saved_checkpoints(
-                    tmpdir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors
+                tmp_dir, 5, int(self.n_epochs * 64 / self.batch_size), False, safe_weights=save_safetensors
            )
    def test_load_best_model_with_save(self):
-        with tempfile.TemporaryDirectory() as tmpdir:
+        tmp_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(
-                output_dir=tmpdir,
+            output_dir=tmp_dir,
            save_steps=5,
            evaluation_strategy="steps",
            eval_steps=5,
@@ -2750,19 +2753,19 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        trainer.train()
        # Check that we have the last known step:
        assert os.path.exists(
-                os.path.join(tmpdir, f"checkpoint-{trainer.state.max_steps}")
+            os.path.join(tmp_dir, f"checkpoint-{trainer.state.max_steps}")
        ), f"Could not find checkpoint-{trainer.state.max_steps}"
        # And then check the last step
-            assert os.path.exists(os.path.join(tmpdir, "checkpoint-9")), "Could not find checkpoint-9"
+        assert os.path.exists(os.path.join(tmp_dir, "checkpoint-9")), "Could not find checkpoint-9"
        # Now test that using a limit works
        # Should result in:
        # - save at step 5 (but is deleted)
        # - save at step 10 (loaded in at the end when `load_best_model=True`)
        # - save at step 11
-        with tempfile.TemporaryDirectory() as tmpdir:
+        tmp_dir = self.get_auto_remove_tmp_dir()
        trainer = get_regression_trainer(
-                output_dir=tmpdir,
+            output_dir=tmp_dir,
            save_steps=5,
            evaluation_strategy="steps",
            eval_steps=5,
@@ -2772,19 +2775,17 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        )
        trainer.train()
        # Check that we have the last known step:
-            assert os.path.exists(os.path.join(tmpdir, "checkpoint-11")), "Could not find checkpoint-11"
+        assert os.path.exists(os.path.join(tmp_dir, "checkpoint-11")), "Could not find checkpoint-11"
        # And then check the last multiple
-            assert os.path.exists(os.path.join(tmpdir, "checkpoint-10")), "Could not find checkpoint-10"
+        assert os.path.exists(os.path.join(tmp_dir, "checkpoint-10")), "Could not find checkpoint-10"
        # Finally check that we don't have an old one
-            assert not os.path.exists(os.path.join(tmpdir, "checkpoint-5")), "Found checkpoint-5, limit not respected"
+        assert not os.path.exists(os.path.join(tmp_dir, "checkpoint-5")), "Found checkpoint-5, limit not respected"
        # Finally check that the right model was loaded in, checkpoint-10
        # this goes by the last `eval` step check to do so, so it won't be
        # the last model *saved*
        model_state = trainer.model.state_dict()
-            final_model_weights = safetensors.torch.load_file(
+        final_model_weights = safetensors.torch.load_file(os.path.join(tmp_dir, "checkpoint-10", "model.safetensors"))
                os.path.join(tmpdir, "checkpoint-10", "model.safetensors")
            )
        for k, v in model_state.items():
            assert torch.allclose(v, final_model_weights[k]), f"{k} is not the same"
@@ -2794,8 +2795,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        # since wrapping primarily happens on multi-gpu setup we want multiple gpus to test for
        # example DataParallel(DataParallel(model))
-        with tempfile.TemporaryDirectory() as tmp_dir:
+        trainer = get_regression_trainer(output_dir=self.get_auto_remove_tmp_dir())
-            trainer = get_regression_trainer(output_dir=tmp_dir)
        trainer.train()
        model_wrapped_before = trainer.model_wrapped
        trainer.train()
@@ -2808,9 +2808,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        # save_steps, the checkpoint will resume training at epoch 2 or more (so the data seen by the model
        # won't be the same since the training dataloader is shuffled).
-        with tempfile.TemporaryDirectory() as tmpdir:
+        tmp_dir = self.get_auto_remove_tmp_dir()
        kwargs = {
-                "output_dir": tmpdir,
+            "output_dir": tmp_dir,
            "train_len": 128,
            "save_steps": 5,
            "learning_rate": 0.1,
@@ -2821,7 +2821,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        (a, b) = trainer.model.a.item(), trainer.model.b.item()
        state = dataclasses.asdict(trainer.state)
-            checkpoint = os.path.join(tmpdir, "checkpoint-5")
+        checkpoint = os.path.join(tmp_dir, "checkpoint-5")
        # Reinitialize trainer
        trainer = get_regression_trainer(**kwargs)
@@ -2834,7 +2834,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        self.check_trainer_state_are_the_same(state, state1)
        # Now check with a later checkpoint that it also works when we span over one epoch
-            checkpoint = os.path.join(tmpdir, "checkpoint-15")
+        checkpoint = os.path.join(tmp_dir, "checkpoint-15")
        # Reinitialize trainer and load model
        trainer = get_regression_trainer(**kwargs)
@@ -2847,9 +2847,9 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        self.check_trainer_state_are_the_same(state, state1)
        # With a regular model that is not a PreTrainedModel
-        with tempfile.TemporaryDirectory() as tmpdir:
+        tmp_dir = self.get_auto_remove_tmp_dir()
        kwargs = {
-                "output_dir": tmpdir,
+            "output_dir": tmp_dir,
            "train_len": 128,
            "save_steps": 5,
            "learning_rate": 0.1,
@@ -2861,7 +2861,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        (a, b) = trainer.model.a.item(), trainer.model.b.item()
        state = dataclasses.asdict(trainer.state)
-            checkpoint = os.path.join(tmpdir, "checkpoint-5")
+        checkpoint = os.path.join(tmp_dir, "checkpoint-5")
        # Reinitialize trainer and load model
        trainer = get_regression_trainer(**kwargs)
@@ -2874,7 +2874,7 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        self.check_trainer_state_are_the_same(state, state1)
        # Now check with a later checkpoint that it also works when we span over one epoch
-            checkpoint = os.path.join(tmpdir, "checkpoint-15")
+        checkpoint = os.path.join(tmp_dir, "checkpoint-15")
        # Reinitialize trainer and load model
        trainer = get_regression_trainer(**kwargs)
@@ -2889,15 +2889,15 @@ class TrainerIntegrationTest(TestCasePlus, TrainerIntegrationCommon):
        # Now check failures
        # 1. fail to find a bogus checkpoint
-        with tempfile.TemporaryDirectory() as tmpdir:
+        tmp_dir = self.get_auto_remove_tmp_dir()
-            trainer = get_regression_trainer(output_dir=tmpdir)
+        trainer = get_regression_trainer(output_dir=tmp_dir)
        with self.assertRaises(Exception) as context:
            trainer.train(resume_from_checkpoint=f"{checkpoint}-bogus")
        self.assertTrue("Can't find a valid checkpoint at" in str(context.exception))
        # 2. fail to find any checkpoint - due a fresh output_dir
-        with tempfile.TemporaryDirectory() as tmpdir:
+        tmp_dir = self.get_auto_remove_tmp_dir()
-            trainer = get_regression_trainer(output_dir=tmpdir)
+        trainer = get_regression_trainer(output_dir=tmp_dir)
        with self.assertRaises(Exception) as context:
            trainer.train(resume_from_checkpoint=True)
        self.assertTrue("No valid checkpoint found in output directory" in str(context.exception))