From 42f359d015aee3835490bdcfa20df657a4d97049 Mon Sep 17 00:00:00 2001 From: Olatunji Ruwase Date: Mon, 30 Aug 2021 10:01:06 -0700 Subject: [PATCH] Use DS callable API to allow hf_scheduler + ds_optimizer (#13216) * Use DS callable API to allow hf_scheduler + ds_optimizer * Preserve backward-compatibility * Restore backward compatibility * Tweak arg positioning * Tweak arg positioning * bump the required version * Undo indent * Update src/transformers/trainer.py * style Co-authored-by: Stas Bekman Co-authored-by: Stas Bekman --- setup.py | 2 +- src/transformers/deepspeed.py | 28 +++++++------------ src/transformers/dependency_versions_table.py | 2 +- src/transformers/trainer.py | 12 +++++--- tests/deepspeed/test_deepspeed.py | 11 +++----- 5 files changed, 24 insertions(+), 31 deletions(-) diff --git a/setup.py b/setup.py index 6d7062668e..f7e28b2f5d 100644 --- a/setup.py +++ b/setup.py @@ -91,7 +91,7 @@ _deps = [ "cookiecutter==1.7.2", "dataclasses", "datasets", - "deepspeed>=0.4.3", + "deepspeed>=0.5.1", "docutils==0.16.0", "fairscale>0.3", "faiss-cpu", diff --git a/src/transformers/deepspeed.py b/src/transformers/deepspeed.py index 63cac7a67f..79bc3db7ac 100644 --- a/src/transformers/deepspeed.py +++ b/src/transformers/deepspeed.py @@ -311,13 +311,13 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): # 1. DS scheduler + DS optimizer: Yes # 2. HF scheduler + HF optimizer: Yes # 3. DS scheduler + HF optimizer: Yes - # 4. HF scheduler + DS optimizer: No + # 4. HF scheduler + DS optimizer: Yes # # Unless Offload is enabled in which case it's: # 1. DS scheduler + DS optimizer: Yes # 2. HF scheduler + HF optimizer: Mostly* # 3. DS scheduler + HF optimizer: Mostly* - # 4. HF scheduler + DS optimizer: No + # 4. HF scheduler + DS optimizer: Yes # # Mostly*: All non-native DeepSpeed optimizers that have both CPU and GPU implementation should work (except LAMB) @@ -336,28 +336,20 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None): # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch. # But trainer uses AdamW by default. - trainer.create_optimizer() - optimizer = trainer.optimizer + optimizer = trainer.create_optimizer() # To use other optimizers requires voiding warranty with: `zero_allow_untested_optimizer` config["zero_allow_untested_optimizer"] = True - # DS schedulers (deepspeed/runtime/lr_schedules.py): - # - # DS name | --lr_scheduler_type | HF func | Notes - # -------------| ---------------------|-----------------------------------|-------------------- - # LRRangeTest | na | na | LRRT - # OneCycle | na | na | 1CLR - # WarmupLR | constant_with_warmup | get_constant_schedule_with_warmup | w/ warmup_min_lr=0 - # WarmupDecayLR| linear | get_linear_schedule_with_warmup | + def _lr_scheduler_callable(optimizer): + return trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) + lr_scheduler = None if "scheduler" not in config: - if "optimizer" in config: - # to make this option work, we need to init DS optimizer first, then init HS scheduler, - # then pass the HS scheduler to DS init, which is not possible at the moment - raise ValueError("At the moment HF scheduler + DeepSpeed optimizer combination is not possible") + if optimizer is None: + # Optimizer is not available, so use callable to defer lr_scheduler creation to DS init + lr_scheduler = _lr_scheduler_callable else: - trainer.create_scheduler(num_training_steps=num_training_steps) - lr_scheduler = trainer.lr_scheduler + lr_scheduler = trainer.create_scheduler(num_training_steps=num_training_steps, optimizer=optimizer) # keep for quick debug: # from pprint import pprint; pprint(config) diff --git a/src/transformers/dependency_versions_table.py b/src/transformers/dependency_versions_table.py index 1dd163834a..848951d13c 100644 --- a/src/transformers/dependency_versions_table.py +++ b/src/transformers/dependency_versions_table.py @@ -8,7 +8,7 @@ deps = { "cookiecutter": "cookiecutter==1.7.2", "dataclasses": "dataclasses", "datasets": "datasets", - "deepspeed": "deepspeed>=0.4.3", + "deepspeed": "deepspeed>=0.5.1", "docutils": "docutils==0.16.0", "fairscale": "fairscale>0.3", "faiss-cpu": "faiss-cpu", diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py index 22657b8a05..caa9d2798c 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py @@ -768,7 +768,7 @@ class Trainer: and/or :obj:`create_scheduler`) in a subclass. """ self.create_optimizer() - self.create_scheduler(num_training_steps) + self.create_scheduler(num_training_steps=num_training_steps, optimizer=self.optimizer) def create_optimizer(self): """ @@ -813,9 +813,12 @@ class Trainer: if is_sagemaker_mp_enabled(): self.optimizer = smp.DistributedOptimizer(self.optimizer) - def create_scheduler(self, num_training_steps: int): + return self.optimizer + + def create_scheduler(self, num_training_steps: int, optimizer: torch.optim.Optimizer = None): """ - Setup the scheduler. The optimizer of the trainer must have been set up before this method is called. + Setup the scheduler. The optimizer of the trainer must have been set up either before this method is called or + passed as an argument. Args: num_training_steps (int): The number of training steps to do. @@ -823,10 +826,11 @@ class Trainer: if self.lr_scheduler is None: self.lr_scheduler = get_scheduler( self.args.lr_scheduler_type, - self.optimizer, + optimizer=self.optimizer if optimizer is None else optimizer, num_warmup_steps=self.args.get_warmup_steps(num_training_steps), num_training_steps=num_training_steps, ) + return self.lr_scheduler def num_examples(self, dataloader: DataLoader) -> int: """ diff --git a/tests/deepspeed/test_deepspeed.py b/tests/deepspeed/test_deepspeed.py index a4acc13c86..a7ba14b022 100644 --- a/tests/deepspeed/test_deepspeed.py +++ b/tests/deepspeed/test_deepspeed.py @@ -292,19 +292,16 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon): self.assertNotEqual(new_a, a) def test_hf_scheduler_ds_optimizer(self): - # this combo is not possible at the moment + a = 0 with mockenv_context(**self.dist_env_1_gpu): ds_config_zero2_dict = self.get_config_dict(ZERO2) del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none" ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict) - with self.assertRaises(Exception) as context: - trainer.train() - self.assertTrue( - "HF scheduler + DeepSpeed optimizer combination is not possible" in str(context.exception), - f"got exception: {context.exception}", - ) + trainer.train() + new_a = trainer.model.a.item() + self.assertNotEqual(new_a, a) @require_deepspeed_aio def test_stage3_nvme_offload(self):