[deepspeed / testing] reset global state (#17553)

* [deepspeed] fix load_best_model test * [deepspeed] add state reset on unittest tearDown
2022-06-06 07:49:25 -07:00
parent 34a886fce3
commit d28b7aa8cb
2 changed files with 38 additions and 21 deletions
--- a/tests/deepspeed/test_deepspeed.py
+++ b/tests/deepspeed/test_deepspeed.py
@@ -25,7 +25,7 @@ import datasets
 from parameterized import parameterized
 from tests.trainer.test_trainer import TrainerIntegrationCommon  # noqa
 from transformers import AutoModel, TrainingArguments, is_torch_available, logging
-from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available
+from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available, unset_hf_deepspeed_config
 from transformers.testing_utils import (
    CaptureLogger,
    CaptureStd,
@@ -161,6 +161,12 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
        )

+    def tearDown(self):
+        super().tearDown()
+
+        # reset the ds config global so that tests state doesn't leak
+        unset_hf_deepspeed_config()
+
    def test_init_zero3_fp16(self):
        # test that zero.Init() works correctly under zero3/fp16
        ds_config = {
@@ -229,6 +235,12 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus):
            zero3=config_zero3,
        )

+    def tearDown(self):
+        super().tearDown()
+
+        # reset the ds config global so that tests state doesn't leak
+        unset_hf_deepspeed_config()
+
    def get_config_dict(self, stage):
        # As some tests modify the dict, always make a copy
        return deepcopy(self.ds_config_dict[stage])
@@ -754,6 +766,25 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T

        with mockenv_context(**self.dist_env_1_gpu):

+            args_dict = {
+                "per_gpu_train_batch_size": 1,
+                "per_gpu_eval_batch_size": 1,
+                "gradient_accumulation_steps": 1,
+                "learning_rate": 1e-4,
+                "num_train_epochs": 1,
+                "do_train": True,
+                "do_eval": True,
+                "optim": "adafactor",
+                "evaluation_strategy": "steps",
+                "eval_steps": 1,
+                "save_strategy": "steps",
+                "save_steps": 1,
+                "load_best_model_at_end": True,
+                "max_steps": 1,
+                "deepspeed": ds_config_dict,
+            }
+
+            training_args = TrainingArguments(output_dir, **args_dict)
            tokenizer = T5Tokenizer.from_pretrained(T5_TINY)
            model = T5ForConditionalGeneration.from_pretrained(T5_TINY)

@@ -788,26 +819,6 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T

            train_dataset, eval_dataset = get_dataset()

-            args_dict = {
-                "per_gpu_train_batch_size": 1,
-                "per_gpu_eval_batch_size": 1,
-                "gradient_accumulation_steps": 1,
-                "learning_rate": 1e-4,
-                "num_train_epochs": 1,
-                "do_train": True,
-                "do_eval": True,
-                "optim": "adafactor",
-                "evaluation_strategy": "steps",
-                "eval_steps": 1,
-                "save_strategy": "steps",
-                "save_steps": 1,
-                "load_best_model_at_end": True,
-                "max_steps": 1,
-                "deepspeed": ds_config_dict,
-            }
-
-            training_args = TrainingArguments(output_dir, **args_dict)
-
            trainer = Trainer(
                model=model,
                tokenizer=tokenizer,