accelerate deepspeed and gradient accumulation integrate (#23236)
* mixed precision support via accelerate * fix issues * fix for the sharded ddp case * fix flax and tf failing tests * `refactor the place to create `Accelerator` object * move ddp prep to accelerate * fix 😅 * resolving comments * move fsdp handling to accelerate * fixex * fix saving * shift torch dynamo handling to accelerate * shift deepspeed integration and save & load utils to accelerate * fix accelerate launcher support * oops * fix 🐛 * save ckpt fix * Trigger CI * nasty 🐛 😅 * as deepspeed needs grad_acc fixes, transfer grad_acc to accelerate * make tests happy * quality ✨ * loss tracked needs to account for grad_acc * fixing the deepspeed tests * quality ✨ * 😅😅😅 * tests 😡 * quality ✨ * Trigger CI * resolve comments and fix the issue with the previous merge from branch * Trigger CI * accelerate took over deepspeed integration --------- Co-authored-by: Stas Bekman <stas@stason.org>
This commit is contained in:
committed by
GitHub
parent
88f50a1e89
commit
a73b1d59a3
@@ -365,16 +365,19 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
||||
self.assertNotEqual(new_a, a)
|
||||
|
||||
def test_hf_scheduler_ds_optimizer(self):
|
||||
a = 0
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
ds_config_zero2_dict = self.get_config_dict(ZERO2)
|
||||
del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler
|
||||
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
|
||||
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
||||
trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
|
||||
trainer.train()
|
||||
new_a = trainer.model.a.item()
|
||||
self.assertNotEqual(new_a, a)
|
||||
with self.assertRaises(Exception) as context:
|
||||
trainer.train()
|
||||
self.assertIn(
|
||||
"Found `optimizer` configured in the DeepSpeed config, but no `scheduler`. "
|
||||
"Please configure a scheduler in the DeepSpeed config.",
|
||||
str(context.exception),
|
||||
)
|
||||
|
||||
@require_deepspeed_aio
|
||||
def test_stage3_nvme_offload(self):
|
||||
@@ -751,6 +754,8 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
||||
config = deepspeed_config()
|
||||
self.assertTrue(bool(config), "Deepspeed config should be accessible")
|
||||
|
||||
# with accelerate integration below line is additionally required for this test to pass
|
||||
trainer.accelerator.state._reset_state()
|
||||
del trainer
|
||||
# now weakref should gc the global and we shouldn't get anything here
|
||||
config = deepspeed_config()
|
||||
@@ -783,8 +788,8 @@ class TrainerIntegrationDeepSpeed(TrainerIntegrationDeepSpeedWithCustomConfig, T
|
||||
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
args_dict = {
|
||||
"per_gpu_train_batch_size": 1,
|
||||
"per_gpu_eval_batch_size": 1,
|
||||
"per_device_train_batch_size": 1,
|
||||
"per_device_eval_batch_size": 1,
|
||||
"gradient_accumulation_steps": 1,
|
||||
"learning_rate": 1e-4,
|
||||
"num_train_epochs": 1,
|
||||
|
||||
Reference in New Issue
Block a user