[trainer] deepspeed integration (#9211)
* deepspeed integration * style * add test * ds wants to do its own backward * fp16 assert * Update src/transformers/training_args.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * style * for clarity extract what args are being passed to deepspeed * introduce the concept of self.wrapped_model * s/self.wrapped_model/self.model_wrapped/ * complete transition to self.wrapped_model / self.model * fix * doc * give ds its own init * add custom overrides, handle bs correctly * fix test * clean up model_init logic, fix small bug * complete fix * collapse --deepspeed_config into --deepspeed * style * start adding doc notes * style * implement hf2ds optimizer and scheduler configuration remapping * oops * call get_num_training_steps absolutely when needed * workaround broken auto-formatter * deepspeed_config arg is no longer needed - fixed in deepspeed master * use hf's fp16 args in config * clean * start on the docs * rebase cleanup * finish up --fp16 * clarify the supported stages * big refactor thanks to discovering deepspeed.init_distributed * cleanup * revert fp16 part * add checkpoint-support * more init ds into integrations * extend docs * cleanup * unfix docs * clean up old code * imports * move docs * fix logic * make it clear which file it's referring to * document nodes/gpus * style * wrong format * style * deepspeed handles gradient clipping * easier to read * major doc rewrite * Apply suggestions from code review Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * docs * switch to AdamW optimizer * style * Apply suggestions from code review Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * clarify doc Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
This commit is contained in:
47
examples/seq2seq/ds_config.json
Normal file
47
examples/seq2seq/ds_config.json
Normal file
@@ -0,0 +1,47 @@
|
||||
{
|
||||
"fp16": {
|
||||
"enabled": true,
|
||||
"loss_scale": 0,
|
||||
"loss_scale_window": 1000,
|
||||
"hysteresis": 2,
|
||||
"min_loss_scale": 1
|
||||
},
|
||||
|
||||
"zero_optimization": {
|
||||
"stage": 2,
|
||||
"allgather_partitions": true,
|
||||
"allgather_bucket_size": 2e8,
|
||||
"overlap_comm": true,
|
||||
"reduce_scatter": true,
|
||||
"reduce_bucket_size": 2e8,
|
||||
"contiguous_gradients": true,
|
||||
"cpu_offload": true
|
||||
},
|
||||
|
||||
"zero_allow_untested_optimizer": true,
|
||||
|
||||
"optimizer": {
|
||||
"type": "AdamW",
|
||||
"params": {
|
||||
"lr": 3e-5,
|
||||
"betas": [
|
||||
0.8,
|
||||
0.999
|
||||
],
|
||||
"eps": 1e-8,
|
||||
"weight_decay": 3e-7
|
||||
}
|
||||
},
|
||||
|
||||
"scheduler": {
|
||||
"type": "WarmupLR",
|
||||
"params": {
|
||||
"warmup_min_lr": 0,
|
||||
"warmup_max_lr": 3e-5,
|
||||
"warmup_num_steps": 500
|
||||
}
|
||||
},
|
||||
|
||||
"steps_per_print": 2000,
|
||||
"wall_clock_breakdown": false
|
||||
}
|
||||
@@ -18,7 +18,7 @@ import unittest
|
||||
from unittest.mock import patch
|
||||
|
||||
from transformers.file_utils import is_apex_available
|
||||
from transformers.integrations import is_fairscale_available
|
||||
from transformers.integrations import is_deepspeed_available, is_fairscale_available
|
||||
from transformers.testing_utils import (
|
||||
TestCasePlus,
|
||||
execute_subprocess_async,
|
||||
@@ -49,6 +49,17 @@ def require_fairscale(test_case):
|
||||
return test_case
|
||||
|
||||
|
||||
# a candidate for testing_utils
|
||||
def require_deepspeed(test_case):
    """
    Decorator that skips the decorated test unless deepspeed is available.

    Returns ``test_case`` unchanged when ``is_deepspeed_available()`` is true; otherwise returns it wrapped by
    ``unittest.skip`` with the reason "test requires deepspeed".
    """
    # Guard clause: nothing to wrap when deepspeed can be used.
    if is_deepspeed_available():
        return test_case

    skip_without_deepspeed = unittest.skip("test requires deepspeed")
    return skip_without_deepspeed(test_case)
|
||||
|
||||
|
||||
# a candidate for testing_utils
|
||||
def require_apex(test_case):
|
||||
"""
|
||||
@@ -61,8 +72,8 @@ def require_apex(test_case):
|
||||
|
||||
|
||||
class TestFinetuneTrainer(TestCasePlus):
|
||||
def finetune_trainer_quick(self, distributed=None, extra_args_str=None):
|
||||
output_dir = self.run_trainer(1, "12", MBART_TINY, 1, distributed, extra_args_str)
|
||||
def finetune_trainer_quick(self, distributed=None, deepspeed=False, extra_args_str=None):
|
||||
output_dir = self.run_trainer(1, "12", MBART_TINY, 1, distributed, deepspeed, extra_args_str)
|
||||
logs = TrainerState.load_from_json(os.path.join(output_dir, "trainer_state.json")).log_history
|
||||
eval_metrics = [log for log in logs if "eval_loss" in log.keys()]
|
||||
first_step_stats = eval_metrics[0]
|
||||
@@ -96,6 +107,11 @@ class TestFinetuneTrainer(TestCasePlus):
|
||||
def test_finetune_trainer_apex(self):
|
||||
self.finetune_trainer_quick(extra_args_str="--fp16 --fp16_backend=apex")
|
||||
|
||||
@require_torch_multi_gpu
@require_deepspeed
def test_finetune_trainer_deepspeed(self):
    """Run the quick fine-tuning check with the ``deepspeed`` flag switched on."""
    self.finetune_trainer_quick(deepspeed=True)
|
||||
|
||||
@slow
|
||||
def test_finetune_trainer_slow(self):
|
||||
# There is a missing call to __init__process_group somewhere
|
||||
@@ -125,6 +141,7 @@ class TestFinetuneTrainer(TestCasePlus):
|
||||
model_name: str,
|
||||
num_train_epochs: int,
|
||||
distributed: bool = False,
|
||||
deepspeed: bool = False,
|
||||
extra_args_str: str = None,
|
||||
):
|
||||
data_dir = self.examples_dir / "seq2seq/test_data/wmt_en_ro"
|
||||
@@ -164,7 +181,15 @@ class TestFinetuneTrainer(TestCasePlus):
|
||||
if extra_args_str is not None:
|
||||
args.extend(extra_args_str.split())
|
||||
|
||||
if distributed:
|
||||
if deepspeed:
|
||||
ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config.json".split()
|
||||
distributed_args = f"""
|
||||
{self.test_file_dir}/finetune_trainer.py
|
||||
""".split()
|
||||
cmd = ["deepspeed"] + distributed_args + args + ds_args
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
|
||||
elif distributed:
|
||||
n_gpu = get_gpu_count()
|
||||
distributed_args = f"""
|
||||
-m torch.distributed.launch
|
||||
@@ -173,6 +198,7 @@ class TestFinetuneTrainer(TestCasePlus):
|
||||
""".split()
|
||||
cmd = [sys.executable] + distributed_args + args
|
||||
execute_subprocess_async(cmd, env=self.get_env())
|
||||
|
||||
else:
|
||||
testargs = ["finetune_trainer.py"] + args
|
||||
with patch.object(sys, "argv", testargs):
|
||||
|
||||
Reference in New Issue
Block a user