[DeepSpeed] fp32 support (#11499)
* prep for deepspeed==0.3.16 * new version * too soon * support and test fp32 mode * troubleshooting doc start * workaround no longer needed * add fp32 doc * style * cleanup, add tf32 note * clarify * release was made
This commit is contained in:
@@ -1507,6 +1507,35 @@ and ``total_num_steps`, ``warmup_max_lr``, ``warmup_num_steps`` and ``total_num_
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
fp32 Precision
|
||||||
|
=======================================================================================================================
|
||||||
|
|
||||||
|
DeepSpeed supports both full fp32 and fp16 mixed precision.
|
||||||
|
|
||||||
|
Because of the much reduced memory needs and faster speed one gets with the fp16 mixed precision, the only time you
|
||||||
|
will not want to use it is when the model you're using doesn't behave well under this training mode. Typically this
|
||||||
|
happens when the model wasn't pretrained in the fp16 mixed precision (e.g. often this happens with bf16-pretrained
|
||||||
|
models). Such models may overflow or underflow leading to ``NaN`` loss. If this is your case then you will want to use
|
||||||
|
the full fp32 mode, by explicitly disabling the otherwise default fp16 mixed precision mode with:
|
||||||
|
|
||||||
|
.. code-block:: json
|
||||||
|
|
||||||
|
{
|
||||||
|
"fp16": {
|
||||||
|
"enabled": false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
If you're using an Ampere-architecture based GPU, pytorch version 1.7 and higher will automatically switch to using
|
||||||
|
the much more efficient tf32 format for some operations, but the results will still be in fp32. For details and
|
||||||
|
benchmarks, please see `TensorFloat-32(TF32) on Ampere devices
|
||||||
|
<https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices>`__. The document includes
|
||||||
|
instructions on how to disable this automatic conversion if for some reason you prefer not to use it.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Automatic Mixed Precision
|
Automatic Mixed Precision
|
||||||
=======================================================================================================================
|
=======================================================================================================================
|
||||||
|
|
||||||
@@ -1532,11 +1561,6 @@ and the :class:`~transformers.Trainer` will automatically enable or disable it b
|
|||||||
|
|
||||||
This mode gets enabled when ``--fp16 --fp16_backend amp`` command line args are passed.
|
This mode gets enabled when ``--fp16 --fp16_backend amp`` command line args are passed.
|
||||||
|
|
||||||
.. note::
|
|
||||||
|
|
||||||
At the moment DeepSpeed doesn't supported fp32 mode, though it will become available soon. Until then it will be
|
|
||||||
always set to ``true``.
|
|
||||||
|
|
||||||
You can also enable/disable this mode explicitly:
|
You can also enable/disable this mode explicitly:
|
||||||
|
|
||||||
.. code-block:: json
|
.. code-block:: json
|
||||||
@@ -1790,6 +1814,24 @@ stress on ``tensor([1.])``, or if you get an error where it says the parameter i
|
|||||||
larger multi-dimensional shape, this means that the parameter is partitioned and what you see is a ZeRO-3 placeholder.
|
larger multi-dimensional shape, this means that the parameter is partitioned and what you see is a ZeRO-3 placeholder.
|
||||||
|
|
||||||
|
|
||||||
|
Troubleshooting
|
||||||
|
=======================================================================================================================
|
||||||
|
|
||||||
|
* ``deepspeed`` process gets killed at startup without a traceback
|
||||||
|
|
||||||
|
If the ``deepspeed`` process gets killed at launch time without a traceback, that usually means that the program tried
|
||||||
|
to allocate more CPU memory than your system has or your process is allowed to allocate and the OS kernel killed that
|
||||||
|
process. This is because your configuration file most likely has either ``offload_optimizer`` or ``offload_param`` or
|
||||||
|
both configured to offload to ``cpu`` (or under ZeRO-2 ``cpu_offload`` is enabled). If you have NVMe, experiment with
|
||||||
|
offloading to NVMe if you're running under ZeRO-3.
|
||||||
|
|
||||||
|
Work is being done to enable estimating how much memory is needed for a specific model: `PR
|
||||||
|
<https://github.com/microsoft/DeepSpeed/pull/965>`__.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
Notes
|
Notes
|
||||||
=======================================================================================================================
|
=======================================================================================================================
|
||||||
|
|||||||
2
setup.py
2
setup.py
@@ -90,7 +90,7 @@ _deps = [
|
|||||||
"cookiecutter==1.7.2",
|
"cookiecutter==1.7.2",
|
||||||
"dataclasses",
|
"dataclasses",
|
||||||
"datasets",
|
"datasets",
|
||||||
"deepspeed>=0.3.15",
|
"deepspeed>=0.3.16",
|
||||||
"docutils==0.16.0",
|
"docutils==0.16.0",
|
||||||
"fairscale>0.3",
|
"fairscale>0.3",
|
||||||
"faiss-cpu",
|
"faiss-cpu",
|
||||||
|
|||||||
@@ -7,7 +7,7 @@ deps = {
|
|||||||
"cookiecutter": "cookiecutter==1.7.2",
|
"cookiecutter": "cookiecutter==1.7.2",
|
||||||
"dataclasses": "dataclasses",
|
"dataclasses": "dataclasses",
|
||||||
"datasets": "datasets",
|
"datasets": "datasets",
|
||||||
"deepspeed": "deepspeed>=0.3.15",
|
"deepspeed": "deepspeed>=0.3.16",
|
||||||
"docutils": "docutils==0.16.0",
|
"docutils": "docutils==0.16.0",
|
||||||
"fairscale": "fairscale>0.3",
|
"fairscale": "fairscale>0.3",
|
||||||
"faiss-cpu": "faiss-cpu",
|
"faiss-cpu": "faiss-cpu",
|
||||||
|
|||||||
@@ -374,10 +374,7 @@ class DeepSpeedConfigHF:
|
|||||||
# amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
|
# amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
|
||||||
# any here unless the user did the work
|
# any here unless the user did the work
|
||||||
config_fp16 = config.get("fp16")
|
config_fp16 = config.get("fp16")
|
||||||
# XXX: at the moment fp16 can't be False, but the fp32 solution is in works - once it's PR'ed and
|
_set_if_auto(config_fp16, "enabled", fp16_backend == "amp")
|
||||||
# merged and a new release is made, delete the next line and uncomment the one after it
|
|
||||||
_set_if_auto(config_fp16, "enabled", True)
|
|
||||||
# _set_if_auto(config_fp16, "enabled", fp16_backend == "amp")
|
|
||||||
|
|
||||||
# apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
|
# apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
|
||||||
# ZeRO features, so probably best to be avoided.
|
# ZeRO features, so probably best to be avoided.
|
||||||
|
|||||||
@@ -44,7 +44,7 @@ from .file_utils import (
|
|||||||
replace_return_docstrings,
|
replace_return_docstrings,
|
||||||
)
|
)
|
||||||
from .generation_utils import GenerationMixin
|
from .generation_utils import GenerationMixin
|
||||||
from .integrations import is_deepspeed_zero3_enabled
|
from .integrations import deepspeed_config, is_deepspeed_zero3_enabled
|
||||||
from .utils import logging
|
from .utils import logging
|
||||||
|
|
||||||
|
|
||||||
@@ -1124,10 +1124,7 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix
|
|||||||
logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
|
logger.info("Detected DeepSpeed ZeRO-3: activating zero.init() for this model")
|
||||||
# this immediately partitions the model across all gpus, to avoid the overhead in time
|
# this immediately partitions the model across all gpus, to avoid the overhead in time
|
||||||
# and memory copying it on CPU or each GPU first
|
# and memory copying it on CPU or each GPU first
|
||||||
|
with deepspeed.zero.Init(config=deepspeed_config()):
|
||||||
# XXX: param_dict will be added in deepspeed==0.3.16 and probably replaced by deepspeed_config
|
|
||||||
# with deepspeed.zero.Init(param_dict=deepspeed_config()):
|
|
||||||
with deepspeed.zero.Init():
|
|
||||||
model = cls(config, *model_args, **model_kwargs)
|
model = cls(config, *model_args, **model_kwargs)
|
||||||
else:
|
else:
|
||||||
model = cls(config, *model_args, **model_kwargs)
|
model = cls(config, *model_args, **model_kwargs)
|
||||||
|
|||||||
@@ -48,6 +48,7 @@ with ExtendSysPath(f"{bindir}/.."):
|
|||||||
set_seed(42)
|
set_seed(42)
|
||||||
MBART_TINY = "sshleifer/tiny-mbart"
|
MBART_TINY = "sshleifer/tiny-mbart"
|
||||||
T5_SMALL = "t5-small"
|
T5_SMALL = "t5-small"
|
||||||
|
T5_TINY = "patrickvonplaten/t5-tiny-random"
|
||||||
|
|
||||||
|
|
||||||
def load_json(path):
|
def load_json(path):
|
||||||
@@ -108,25 +109,31 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
|
MASTER_ADDR="localhost", MASTER_PORT="10999", RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
|
||||||
)
|
)
|
||||||
|
|
||||||
self.ds_config_file = {}
|
self.ds_config_file = dict(
|
||||||
self.ds_config_file[ZERO2] = f"{self.test_file_dir_str}/ds_config_zero2.json"
|
zero2=f"{self.test_file_dir_str}/ds_config_zero2.json",
|
||||||
self.ds_config_file[ZERO3] = f"{self.test_file_dir_str}/ds_config_zero3.json"
|
zero3=f"{self.test_file_dir_str}/ds_config_zero3.json",
|
||||||
|
)
|
||||||
|
|
||||||
# use self.get_config_dict(stage) to use these to ensure the original is not modified
|
# use self.get_config_dict(stage) to use these to ensure the original is not modified
|
||||||
self.ds_config_dict = {}
|
|
||||||
with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
|
with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
|
||||||
self.ds_config_dict[ZERO2] = json.load(f)
|
config_zero2 = json.load(f)
|
||||||
|
# by default use fp16
|
||||||
|
config_zero2["fp16"]["enabled"] = True
|
||||||
with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
|
with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
|
||||||
self.ds_config_dict[ZERO3] = json.load(f)
|
config_zero3 = json.load(f)
|
||||||
|
# by default use fp16
|
||||||
def get_config_dict(self, stage):
|
config_zero3["fp16"]["enabled"] = True
|
||||||
"""As the tests modify the dict, always make a copy"""
|
|
||||||
config = deepcopy(self.ds_config_dict[stage])
|
|
||||||
if stage == ZERO3:
|
|
||||||
# This setting slows things down, so don't enable it by default unless needed by a test.
|
# This setting slows things down, so don't enable it by default unless needed by a test.
|
||||||
# It's in the file as a demo for users since we want everything to work out of the box even if slower.
|
# It's in the file as a demo for users since we want everything to work out of the box even if slower.
|
||||||
config["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False
|
config_zero3["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False
|
||||||
return config
|
self.ds_config_dict = dict(
|
||||||
|
zero2=config_zero2,
|
||||||
|
zero3=config_zero3,
|
||||||
|
)
|
||||||
|
|
||||||
|
def get_config_dict(self, stage):
|
||||||
|
# As some tests modify the dict, always make a copy
|
||||||
|
return deepcopy(self.ds_config_dict[stage])
|
||||||
|
|
||||||
# --- These tests are enough to run on one of zero stages --- #
|
# --- These tests are enough to run on one of zero stages --- #
|
||||||
|
|
||||||
@@ -192,24 +199,6 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
|
|
||||||
# --- These tests need to run on both zero stages --- #
|
# --- These tests need to run on both zero stages --- #
|
||||||
|
|
||||||
@parameterized.expand(stages)
|
|
||||||
def test_fp32(self, stage):
|
|
||||||
ds_config_dict = self.get_config_dict(stage)
|
|
||||||
ds_config_dict["fp16"]["enabled"] = False # force non-fp16 mode
|
|
||||||
|
|
||||||
# XXX: do we go via from_pretrained in zero 3 here? need to test zero.Init(dtype=torch.float)
|
|
||||||
|
|
||||||
# XXX: rewrite this test once fp32 is supported by DeepSpeed
|
|
||||||
with mockenv_context(**self.dist_env_1_gpu):
|
|
||||||
trainer = get_regression_trainer(local_rank=0, deepspeed=ds_config_dict)
|
|
||||||
with self.assertRaises(Exception) as context:
|
|
||||||
trainer.train()
|
|
||||||
self.assertIn(
|
|
||||||
"ZeRO is only supported if fp16 is enabled",
|
|
||||||
str(context.exception),
|
|
||||||
f"got exception: {context.exception}",
|
|
||||||
)
|
|
||||||
|
|
||||||
@parameterized.expand(stages)
|
@parameterized.expand(stages)
|
||||||
def test_hf_optimizer_with_offload(self, stage):
|
def test_hf_optimizer_with_offload(self, stage):
|
||||||
# must not allow non-DS optimizer when using ZERO-offload
|
# must not allow non-DS optimizer when using ZERO-offload
|
||||||
@@ -239,7 +228,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
# it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
|
# it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
|
||||||
# to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
|
# to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
|
||||||
with mockenv_context(**self.dist_env_1_gpu):
|
with mockenv_context(**self.dist_env_1_gpu):
|
||||||
trainer = get_regression_trainer(local_rank=0, deepspeed=self.ds_config_file[stage])
|
trainer = get_regression_trainer(local_rank=0, deepspeed=self.get_config_dict(stage))
|
||||||
with CaptureLogger(deepspeed_logger) as cs:
|
with CaptureLogger(deepspeed_logger) as cs:
|
||||||
trainer.train()
|
trainer.train()
|
||||||
self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none")
|
self.assertIn("DeepSpeed info", cs.out, "expected DeepSpeed logger output but got none")
|
||||||
@@ -259,7 +248,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
b=b,
|
b=b,
|
||||||
local_rank=0,
|
local_rank=0,
|
||||||
train_len=8,
|
train_len=8,
|
||||||
deepspeed=self.ds_config_file[stage],
|
deepspeed=self.get_config_dict(stage),
|
||||||
per_device_train_batch_size=8,
|
per_device_train_batch_size=8,
|
||||||
logging_steps=1,
|
logging_steps=1,
|
||||||
)
|
)
|
||||||
@@ -267,7 +256,11 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
post_train_a = trainer.model.a.item()
|
post_train_a = trainer.model.a.item()
|
||||||
|
|
||||||
# XXX: for some reason the following check fails with zero3 - not a broken but a
|
# XXX: for some reason the following check fails with zero3 - not a broken but a
|
||||||
# different qualitative outcome - need to investigate at some point
|
# different qualitative outcome - as if optimizer did run
|
||||||
|
# oddly getting 1.0 for both a and b from 0.0 - there is a bug somewhere
|
||||||
|
# print(trainer.model.a.item())
|
||||||
|
# print(trainer.model.b.item())
|
||||||
|
# need to investigate at some point
|
||||||
if stage == ZERO3:
|
if stage == ZERO3:
|
||||||
return
|
return
|
||||||
|
|
||||||
@@ -298,7 +291,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
b=b,
|
b=b,
|
||||||
local_rank=0,
|
local_rank=0,
|
||||||
train_len=train_len,
|
train_len=train_len,
|
||||||
deepspeed=self.ds_config_file[stage],
|
deepspeed=self.get_config_dict(stage),
|
||||||
per_device_train_batch_size=8,
|
per_device_train_batch_size=8,
|
||||||
gradient_accumulation_steps=1,
|
gradient_accumulation_steps=1,
|
||||||
)
|
)
|
||||||
@@ -315,7 +308,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
|
|||||||
b=b,
|
b=b,
|
||||||
local_rank=0,
|
local_rank=0,
|
||||||
train_len=train_len,
|
train_len=train_len,
|
||||||
deepspeed=self.ds_config_file[stage],
|
deepspeed=self.get_config_dict(stage),
|
||||||
per_device_train_batch_size=4,
|
per_device_train_batch_size=4,
|
||||||
gradient_accumulation_steps=2,
|
gradient_accumulation_steps=2,
|
||||||
)
|
)
|
||||||
@@ -532,6 +525,35 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
do_eval=True,
|
do_eval=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@parameterized.expand(stages)
|
||||||
|
def test_fp32_non_distributed(self, stage):
|
||||||
|
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
|
||||||
|
# therefore no quality checks, just basic completion checks are done
|
||||||
|
self.run_and_check(
|
||||||
|
stage=stage,
|
||||||
|
model_name=T5_TINY,
|
||||||
|
distributed=False,
|
||||||
|
do_train=True,
|
||||||
|
do_eval=True,
|
||||||
|
quality_checks=False,
|
||||||
|
fp16=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
@require_torch_multi_gpu
|
||||||
|
@parameterized.expand(stages)
|
||||||
|
def test_fp32_distributed(self, stage):
|
||||||
|
# real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
|
||||||
|
# therefore no quality checks, just basic completion checks are done
|
||||||
|
self.run_and_check(
|
||||||
|
stage=stage,
|
||||||
|
model_name=T5_TINY,
|
||||||
|
distributed=True,
|
||||||
|
do_train=True,
|
||||||
|
do_eval=True,
|
||||||
|
quality_checks=False,
|
||||||
|
fp16=False,
|
||||||
|
)
|
||||||
|
|
||||||
@parameterized.expand(stages)
|
@parameterized.expand(stages)
|
||||||
def test_resume_train_not_from_ds_checkpoint(self, stage):
|
def test_resume_train_not_from_ds_checkpoint(self, stage):
|
||||||
# do normal training and then resume not from the deepspeed checkpoint but explicitly from
|
# do normal training and then resume not from the deepspeed checkpoint but explicitly from
|
||||||
@@ -550,44 +572,50 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
|
|
||||||
self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)
|
self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)
|
||||||
|
|
||||||
def do_checks(self, output_dir, do_train=True, do_eval=True):
|
def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True):
|
||||||
|
|
||||||
if do_train:
|
if do_train:
|
||||||
train_metrics = load_json(os.path.join(output_dir, "train_results.json"))
|
train_metrics = load_json(os.path.join(output_dir, "train_results.json"))
|
||||||
self.assertIn("train_samples_per_second", train_metrics)
|
self.assertIn("train_samples_per_second", train_metrics)
|
||||||
self.assertGreater(train_metrics["train_samples_per_second"], 0.5)
|
if quality_checks:
|
||||||
|
self.assertGreater(train_metrics["train_samples_per_second"], 0.5)
|
||||||
|
|
||||||
if do_eval:
|
if do_eval:
|
||||||
eval_metrics = load_json(os.path.join(output_dir, "eval_results.json"))
|
eval_metrics = load_json(os.path.join(output_dir, "eval_results.json"))
|
||||||
self.assertIn("eval_bleu", eval_metrics)
|
self.assertIn("eval_bleu", eval_metrics)
|
||||||
self.assertGreater(eval_metrics["eval_bleu"], 0)
|
if quality_checks:
|
||||||
|
self.assertGreater(eval_metrics["eval_bleu"], 1)
|
||||||
|
|
||||||
# XXX: need to do better validation beyond just that the run was successful
|
# XXX: need to do better validation beyond just that the run was successful
|
||||||
def run_and_check(
|
def run_and_check(
|
||||||
self,
|
self,
|
||||||
stage,
|
stage,
|
||||||
eval_steps=10,
|
model_name: str = T5_SMALL,
|
||||||
distributed=True,
|
eval_steps: int = 10,
|
||||||
do_train=True,
|
distributed: bool = True,
|
||||||
do_eval=True,
|
do_train: bool = True,
|
||||||
extra_args_str=None,
|
do_eval: bool = True,
|
||||||
remove_args_str=None,
|
quality_checks: bool = True,
|
||||||
|
fp16: bool = True,
|
||||||
|
extra_args_str: str = None,
|
||||||
|
remove_args_str: str = None,
|
||||||
):
|
):
|
||||||
|
|
||||||
# we are doing quality testing so using a small real model
|
# we are doing quality testing so using a small real model
|
||||||
output_dir = self.run_trainer(
|
output_dir = self.run_trainer(
|
||||||
stage=stage,
|
stage=stage,
|
||||||
model_name=T5_SMALL,
|
model_name=model_name,
|
||||||
eval_steps=eval_steps,
|
eval_steps=eval_steps,
|
||||||
num_train_epochs=1,
|
num_train_epochs=1,
|
||||||
do_train=do_train,
|
do_train=do_train,
|
||||||
do_eval=do_eval,
|
do_eval=do_eval,
|
||||||
distributed=distributed,
|
distributed=distributed,
|
||||||
|
fp16=fp16,
|
||||||
extra_args_str=extra_args_str,
|
extra_args_str=extra_args_str,
|
||||||
remove_args_str=remove_args_str,
|
remove_args_str=remove_args_str,
|
||||||
)
|
)
|
||||||
|
|
||||||
self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)
|
self.do_checks(output_dir, do_train=do_train, do_eval=do_eval, quality_checks=quality_checks)
|
||||||
|
|
||||||
return output_dir
|
return output_dir
|
||||||
|
|
||||||
@@ -600,6 +628,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
do_train: bool = False,
|
do_train: bool = False,
|
||||||
do_eval: bool = True,
|
do_eval: bool = True,
|
||||||
distributed: bool = True,
|
distributed: bool = True,
|
||||||
|
fp16: bool = True,
|
||||||
extra_args_str: str = None,
|
extra_args_str: str = None,
|
||||||
remove_args_str: str = None,
|
remove_args_str: str = None,
|
||||||
):
|
):
|
||||||
@@ -629,6 +658,9 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
""".split()
|
""".split()
|
||||||
args.extend(["--source_prefix", '"translate English to Romanian: "'])
|
args.extend(["--source_prefix", '"translate English to Romanian: "'])
|
||||||
|
|
||||||
|
if fp16:
|
||||||
|
args.extend(["--fp16"])
|
||||||
|
|
||||||
actions = 0
|
actions = 0
|
||||||
if do_train:
|
if do_train:
|
||||||
actions += 1
|
actions += 1
|
||||||
@@ -636,7 +668,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
f"""
|
f"""
|
||||||
--do_train
|
--do_train
|
||||||
--num_train_epochs {str(num_train_epochs)}
|
--num_train_epochs {str(num_train_epochs)}
|
||||||
--max_train_samples 100
|
--max_train_samples 16
|
||||||
--per_device_train_batch_size 2
|
--per_device_train_batch_size 2
|
||||||
--learning_rate 3e-3
|
--learning_rate 3e-3
|
||||||
""".split()
|
""".split()
|
||||||
@@ -647,7 +679,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
args.extend(
|
args.extend(
|
||||||
"""
|
"""
|
||||||
--do_eval
|
--do_eval
|
||||||
--max_eval_samples 100
|
--max_eval_samples 16
|
||||||
--per_device_eval_batch_size 2
|
--per_device_eval_batch_size 2
|
||||||
""".split()
|
""".split()
|
||||||
)
|
)
|
||||||
@@ -688,13 +720,14 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
|
|||||||
--overwrite_output_dir
|
--overwrite_output_dir
|
||||||
--do_train
|
--do_train
|
||||||
--do_eval
|
--do_eval
|
||||||
--max_train_samples 10
|
--max_train_samples 16
|
||||||
--max_eval_samples 10
|
--max_eval_samples 16
|
||||||
--per_device_train_batch_size 5
|
--per_device_train_batch_size 2
|
||||||
--per_device_eval_batch_size 5
|
--per_device_eval_batch_size 2
|
||||||
--num_train_epochs 1
|
--num_train_epochs 1
|
||||||
--warmup_steps 8
|
--warmup_steps 8
|
||||||
--block_size 128
|
--block_size 64
|
||||||
|
--fp16
|
||||||
--report_to none
|
--report_to none
|
||||||
""".split()
|
""".split()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user