[Deepspeed] various fixes (#12058)
* replace deprecated config * sub_group_size was too big * complete deprecation removal
This commit is contained in:
@@ -29,13 +29,16 @@
|
||||
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"offload_optimizer": {
|
||||
"device": "cpu",
|
||||
"pin_memory": true
|
||||
},
|
||||
"allgather_partitions": true,
|
||||
"allgather_bucket_size": 2e8,
|
||||
"overlap_comm": true,
|
||||
"reduce_scatter": true,
|
||||
"reduce_bucket_size": 2e8,
|
||||
"contiguous_gradients": true,
|
||||
"cpu_offload": true
|
||||
"contiguous_gradients": true
|
||||
},
|
||||
|
||||
"gradient_accumulation_steps": "auto",
|
||||
|
||||
@@ -39,7 +39,7 @@
|
||||
},
|
||||
"overlap_comm": true,
|
||||
"contiguous_gradients": true,
|
||||
"sub_group_size": 1e14,
|
||||
"sub_group_size": 1e9,
|
||||
"reduce_bucket_size": "auto",
|
||||
"stage3_prefetch_bucket_size": "auto",
|
||||
"stage3_param_persistence_threshold": "auto",
|
||||
|
||||
@@ -269,7 +269,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
||||
ds_config_zero2_dict = self.get_config_dict(ZERO2)
|
||||
del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer
|
||||
del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler
|
||||
ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = False
|
||||
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
|
||||
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
||||
trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
|
||||
trainer.train()
|
||||
@@ -281,7 +281,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
ds_config_zero2_dict = self.get_config_dict(ZERO2)
|
||||
del ds_config_zero2_dict["optimizer"] # force default HF Trainer optimizer
|
||||
ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = False
|
||||
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
|
||||
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
||||
trainer = get_regression_trainer(a=a, local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
|
||||
trainer.train()
|
||||
@@ -293,7 +293,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
ds_config_zero2_dict = self.get_config_dict(ZERO2)
|
||||
del ds_config_zero2_dict["scheduler"] # force default HF Trainer scheduler
|
||||
ds_config_zero2_dict["zero_optimization"]["cpu_offload"] = False
|
||||
ds_config_zero2_dict["zero_optimization"]["offload_optimizer"]["device"] = "none"
|
||||
ds_config_zero2_dict["fp16"]["initial_scale_power"] = 1 # force optimizer on the first step
|
||||
trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_zero2_dict)
|
||||
with self.assertRaises(Exception) as context:
|
||||
@@ -326,10 +326,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
||||
ds_config_dict = self.get_config_dict(stage)
|
||||
del ds_config_dict["optimizer"] # force default HF Trainer optimizer
|
||||
# force cpu offload
|
||||
if stage == "stage2":
|
||||
ds_config_dict["zero_optimization"]["cpu_offload"] = True
|
||||
elif stage == "stage3":
|
||||
ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
|
||||
ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
|
||||
with mockenv_context(**self.dist_env_1_gpu):
|
||||
trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_dict)
|
||||
with self.assertRaises(Exception) as context:
|
||||
|
||||
Reference in New Issue
Block a user