accelerate deepspeed and gradient accumulation integrate (#23236)

* mixed precision support via accelerate

* fix issues

* fix for the sharded ddp case

* fix flax and tf failing tests

* `refactor the place to create `Accelerator` object

* move ddp prep to accelerate

* fix 😅

* resolving comments

* move fsdp handling to accelerate

* fixex

* fix saving

* shift torch dynamo handling to accelerate

* shift deepspeed integration and save & load utils to accelerate

* fix accelerate launcher support

* oops

* fix 🐛

* save ckpt fix

* Trigger CI

* nasty 🐛 😅

* as deepspeed needs grad_acc fixes, transfer grad_acc to accelerate

* make tests happy

* quality 

* loss tracked needs to account for grad_acc

* fixing the deepspeed tests

* quality 

* 😅😅😅

* tests 😡

* quality 

* Trigger CI

* resolve comments and fix the issue with the previous merge from branch

* Trigger CI

* accelerate took over deepspeed integration

---------

Co-authored-by: Stas Bekman <stas@stason.org>
This commit is contained in:
Sourab Mangrulkar
2023-05-31 15:16:22 +05:30
committed by GitHub
parent 88f50a1e89
commit a73b1d59a3
8 changed files with 164 additions and 161 deletions

View File

@@ -365,16 +365,19 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
self.assertNotEqual(new_a, a)
def test_hf_scheduler_ds_optimizer(self):
a = 0
with mockenv_context(**self.dist_env_1_gpu):
ds_config_zero2_dict = self.get_config_dict(ZERO2)
del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
trainer.train()
new_a = trainer.model.a.item()
self.assertNotEqual(new_a, a)
with self.assertRaises(Exception) as context:
trainer.train()
self.assertIn(
"Found `optimizer` configured in the DeepSpeed config, but no `scheduler`. "
"Please configure a scheduler in the DeepSpeed config.",
str(context.exception),
)
@require_deepspeed_aio
def test_stage3_nvme_offload(self):
@@ -751,6 +754,8 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
config = deepspeed_config()
self.assertTrue(bool(config), "Deepspeed config should be accessible")
# with accelerate integration below line is additionally required for this test to pass
trainer.accelerator.state._reset_state()
del trainer
# now weakref should gc the global and we shouldn't get anything here
config = deepspeed_config()
@@ -783,8 +788,8 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
with mockenv_context(**self.dist_env_1_gpu):
args_dict = {
"per_gpu_train_batch_size": 1,
"per_gpu_eval_batch_size": 1,
"per_device_train_batch_size": 1,
"per_device_eval_batch_size": 1,
"gradient_accumulation_steps": 1,
"learning_rate": 1e-4,
"num_train_epochs": 1,