From d28b7aa8cb2e1d10ec5acc5e214faf9525a64a46 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Mon, 6 Jun 2022 07:49:25 -0700 Subject: [PATCH] [deepspeed / testing] reset global state (#17553) * [deepspeed] fix load_best_model test * [deepspeed] add state reset on unittest tearDown --- src/transformers/deepspeed.py | 6 ++++ tests/deepspeed/test_deepspeed.py | 53 +++++++++++++++++++------------ 2 files changed, 38 insertions(+), 21 deletions(-) diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index e29820c92e..9fa22d4629 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -295,6 +295,12 @@ def set_hf_deepspeed_config(hf_deepspeed_config_obj): _hf_deepspeed_config_weak_ref = weakref.ref(hf_deepspeed_config_obj) +def unset_hf_deepspeed_config(): + # useful for unit tests to ensure the global state doesn't leak - call from `tearDown` method + global _hf_deepspeed_config_weak_ref + _hf_deepspeed_config_weak_ref = None + + def is_deepspeed_zero3_enabled(): if _hf_deepspeed_config_weak_ref is not None and _hf_deepspeed_config_weak_ref() is not None: return _hf_deepspeed_config_weak_ref().is_zero3() diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index c35344f6dc..65ef9416cb 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -25,7 +25,7 @@ import datasets from parameterized import parameterized from tests.trainer.test_trainer import TrainerIntegrationCommon # noqa from transformers import AutoModel, TrainingArguments, is_torch_available, logging -from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available +from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available, unset_hf_deepspeed_config from transformers.testing_utils import ( CaptureLogger, CaptureStd, @@ -161,6 +161,12 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1" ) + def tearDown(self): + super().tearDown() + + # reset the ds config global so that tests state doesn't leak + unset_hf_deepspeed_config() + def test_init_zero3_fp16(self): # test that zero.Init() works correctly under zero3/fp16 ds_config = { @@ -229,6 +235,12 @@ class TrainerIntegrationDeepSpeedWithCustomConfig(TestCasePlus): zero3=config_zero3, ) + def tearDown(self): + super().tearDown() + + # reset the ds config global so that tests state doesn't leak + unset_hf_deepspeed_config() + def get_config_dict(self, stage): # As some tests modify the dict, always make a copy return deepcopy(self.ds_config_dict[stage]) @@ -754,6 +766,25 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T with mockenv_context(**self.dist_env_1_gpu): + args_dict = { + "per_gpu_train_batch_size": 1, + "per_gpu_eval_batch_size": 1, + "gradient_accumulation_steps": 1, + "learning_rate": 1e-4, + "num_train_epochs": 1, + "do_train": True, + "do_eval": True, + "optim": "adafactor", + "evaluation_strategy": "steps", + "eval_steps": 1, + "save_strategy": "steps", + "save_steps": 1, + "load_best_model_at_end": True, + "max_steps": 1, + "deepspeed": ds_config_dict, + } + + training_args = TrainingArguments(output_dir, **args_dict) tokenizer = T5Tokenizer.from_pretrained(T5_TINY) model = T5ForConditionalGeneration.from_pretrained(T5_TINY) @@ -788,26 +819,6 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T train_dataset, eval_dataset = get_dataset() - args_dict = { - "per_gpu_train_batch_size": 1, - "per_gpu_eval_batch_size": 1, - "gradient_accumulation_steps": 1, - "learning_rate": 1e-4, - "num_train_epochs": 1, - "do_train": True, - "do_eval": True, - "optim": "adafactor", - "evaluation_strategy": "steps", - "eval_steps": 1, - "save_strategy": "steps", - "save_steps": 1, - "load_best_model_at_end": True, - "max_steps": 1, - "deepspeed": ds_config_dict, - } - - training_args = TrainingArguments(output_dir, **args_dict) - trainer = Trainer( model=model, tokenizer=tokenizer,